//
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates
// <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

#ifdef FBGEMM_ENABLE_KLEIDIAI

#include "./KleidiAIFP32UKernelsNeon.h"

namespace kleidiai {

// FP32 GEMM micro-kernel producing ONE row of C, written as AArch64 NEON
// inline assembly. On targets where __aarch64__ is not defined the function
// body is empty and nothing is computed.
//
// Fields of *gp that are read (all accessed through offsetof immediates):
//   beta         - 32-bit float scale applied to the existing C values
//   b_block_cols - 64-bit count of column blocks to process
//   A, B, C      - 64-bit base addresses of the operands / result
//   k            - 64-bit reduction length
//
// Per column block (16 floats = 64 bytes of C, held in v28-v31):
//   1. acc = beta * C[block] when beta is neither 0.0 nor NaN; otherwise
//      acc = 0 and C is not read.
//   2. for each k step: acc += A[k] * B[16 floats]. B is consumed strictly
//      sequentially, 64 bytes per k step, and its pointer is never rewound,
//      so consecutive column blocks read consecutive regions of B.
//      A is re-read from gp->A for every column block.
//   3. acc is stored to C and the C pointer advances by 64 bytes.
//
// The column loop tests its counter at the bottom (subs/bgt), so the body
// executes once before b_block_cols is examined.
//
// Register roles inside the asm:
//   x20 = scratch / A pointer     x21 = remaining k
//   x22 = C pointer               x23 = B pointer
//   x24 = remaining column blocks x25 = "scale existing C by beta" flag
//   v0  = A values, v1-v16 = B values (main loop), v17-v20 = B values (tail)
//   v28-v31 = accumulators
void NOINLINE gemmkernel_1x2_Neon_fp32_fA0fB0fC0(GemmParamsFP32* gp) {
#ifdef __aarch64__
  __asm__ __volatile__(
      // Load parameters and derive the accumulate flag in x25:
      // x25 = 1 by default, cleared when beta == 0.0 (EQ) or when the
      // comparison is unordered, i.e. beta is NaN (VS).
      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
      "mov x25, #0x1\n"
      "ldr x24, [%x[gp], %[offsetof_b_block_cols]]\n"
      "ldr x23, [%x[gp], %[offsetof_B]]\n"
      "ldr x22, [%x[gp], %[offsetof_C]]\n"
      "fcmp s16, #0.0\n"
      "csel x25, XZR, x25, EQ\n"
      "csel x25, XZR, x25, VS\n"
      "1:" // Height 1: Column loop
      "tbz x25, #0, 2f\n"
      // Accumulate path: load the 16 existing C values and scale them by beta.
      // beta is re-broadcast from memory on every column block because v16 is
      // reused for B data further down.
      "ldr q28, [x22, #0x0]\n"
      "ldr q29, [x22, #0x10]\n"
      "add x20, %x[gp], %[offsetof_beta]\n"
      "ldr q30, [x22, #0x20]\n"
      "ldr q31, [x22, #0x30]\n"
      "ld1r { v16.4s }, [x20]\n"
      "fmul v28.4s, v28.4s, v16.4s\n"
      "fmul v29.4s, v29.4s, v16.4s\n"
      "fmul v30.4s, v30.4s, v16.4s\n"
      "fmul v31.4s, v31.4s, v16.4s\n"
      "b 3f\n"
      "2:" // Height 1: no accumulate
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "movi v31.16b, #0x0\n"
      "3:" // Height 1: setup done
      // Restart A and the k counter for this column block.
      "ldr x20, [%x[gp], %[offsetof_A]]\n"
      "ldr x21, [%x[gp], %[offsetof_k]]\n"
      // Register-to-itself move: has no effect (presumably generator residue).
      "mov x20, x20\n"
      // Fewer than 4 k steps: skip the unrolled path, use the scalar tail only.
      "cmp x21, #0x4\n"
      "blt 7f\n"
      // Pipeline prologue: preload 4 A values (v0) and the matching 4 k steps
      // of B (v1-v16, 256 bytes). The flags from "cmp x21, #0x8" are consumed
      // by the "blt 6f" after the loads; the loads do not modify flags.
      "ldr q0, [x20, #0x0]\n"
      "ldr q1, [x23, #0x0]\n"
      "cmp x21, #0x8\n"
      "ldr q2, [x23, #0x10]\n"
      "ldr q3, [x23, #0x20]\n"
      "ldr q4, [x23, #0x30]\n"
      "ldr q5, [x23, #0x40]\n"
      "ldr q6, [x23, #0x50]\n"
      "ldr q7, [x23, #0x60]\n"
      "ldr q8, [x23, #0x70]\n"
      "ldr q9, [x23, #0x80]\n"
      "ldr q10, [x23, #0x90]\n"
      "ldr q11, [x23, #0xa0]\n"
      "ldr q12, [x23, #0xb0]\n"
      "ldr q13, [x23, #0xc0]\n"
      "ldr q14, [x23, #0xd0]\n"
      "ldr q15, [x23, #0xe0]\n"
      "ldr q16, [x23, #0xf0]\n"
      "blt 6f\n"
      // Main loop: consumes the 4 preloaded k steps while loading the next 4.
      // It is entered/repeated only while at least 8 k steps remain, so the
      // data loaded here is always consumed by a later iteration or by the
      // final block at label 6. Lane j of v0 multiplies B registers
      // v(1+4j)..v(4+4j).
      "5:" // Height 1: Multiply loop: Main loop head
      "fmla v28.4s, v1.4s, v0.s[0]\n"
      "fmla v29.4s, v2.4s, v0.s[0]\n"
      "sub x21, x21, #0x4\n"
      "add x20, x20, #0x10\n"
      "fmla v30.4s, v3.4s, v0.s[0]\n"
      "fmla v31.4s, v4.4s, v0.s[0]\n"
      "cmp x21, #0x8\n"
      "add x23, x23, #0x100\n"
      "ldr q1, [x23, #0x0]\n"
      "ldr q2, [x23, #0x10]\n"
      "ldr q3, [x23, #0x20]\n"
      "ldr q4, [x23, #0x30]\n"
      "fmla v28.4s, v5.4s, v0.s[1]\n"
      "ldr q5, [x23, #0x40]\n"
      "fmla v29.4s, v6.4s, v0.s[1]\n"
      "ldr q6, [x23, #0x50]\n"
      "fmla v30.4s, v7.4s, v0.s[1]\n"
      "ldr q7, [x23, #0x60]\n"
      "fmla v31.4s, v8.4s, v0.s[1]\n"
      "ldr q8, [x23, #0x70]\n"
      // Prefetch A 128 bytes ahead of the current position.
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v28.4s, v9.4s, v0.s[2]\n"
      "ldr q9, [x23, #0x80]\n"
      "fmla v29.4s, v10.4s, v0.s[2]\n"
      "ldr q10, [x23, #0x90]\n"
      "fmla v30.4s, v11.4s, v0.s[2]\n"
      "ldr q11, [x23, #0xa0]\n"
      "fmla v31.4s, v12.4s, v0.s[2]\n"
      "ldr q12, [x23, #0xb0]\n"
      "fmla v28.4s, v13.4s, v0.s[3]\n"
      "ldr q13, [x23, #0xc0]\n"
      "fmla v29.4s, v14.4s, v0.s[3]\n"
      "ldr q14, [x23, #0xd0]\n"
      "fmla v30.4s, v15.4s, v0.s[3]\n"
      "ldr q15, [x23, #0xe0]\n"
      "fmla v31.4s, v16.4s, v0.s[3]\n"
      // v0 is reloaded only after its last use above.
      "ldr q0, [x20, #0x0]\n"
      "ldr q16, [x23, #0xf0]\n"
      "bge 5b\n"
      // Pipeline epilogue: consume the last preloaded group of 4 k steps
      // without loading any further data.
      "6:" // Height 1: Multiply loop: Single iteration only
      "fmla v28.4s, v1.4s, v0.s[0]\n"
      "fmla v29.4s, v2.4s, v0.s[0]\n"
      "add x20, x20, #0x10\n"
      "sub x21, x21, #0x4\n"
      "fmla v30.4s, v3.4s, v0.s[0]\n"
      "fmla v31.4s, v4.4s, v0.s[0]\n"
      "add x23, x23, #0x100\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v28.4s, v5.4s, v0.s[1]\n"
      "fmla v29.4s, v6.4s, v0.s[1]\n"
      "fmla v30.4s, v7.4s, v0.s[1]\n"
      "fmla v31.4s, v8.4s, v0.s[1]\n"
      "fmla v28.4s, v9.4s, v0.s[2]\n"
      "fmla v29.4s, v10.4s, v0.s[2]\n"
      "fmla v30.4s, v11.4s, v0.s[2]\n"
      "fmla v31.4s, v12.4s, v0.s[2]\n"
      "fmla v28.4s, v13.4s, v0.s[3]\n"
      "fmla v29.4s, v14.4s, v0.s[3]\n"
      "fmla v30.4s, v15.4s, v0.s[3]\n"
      "fmla v31.4s, v16.4s, v0.s[3]\n"
      "7:" // Height 1: Multiply loop: Main loop skip
      "cbz x21, 9f\n"
      // Tail: one k step per iteration (1 float of A, 64 bytes of B) for the
      // remaining k steps that did not fill a group of 4.
      "8:" // Height 1: Multiply loop: Odd block loop
      "ldr s0, [x20], #0x4\n"
      "ldr q17, [x23, #0x0]\n"
      "sub x21, x21, #0x1\n"
      "ldr q18, [x23, #0x10]\n"
      "ldr q19, [x23, #0x20]\n"
      "ldr q20, [x23, #0x30]\n"
      "add x23, x23, #0x40\n"
      "fmla v28.4s, v17.4s, v0.s[0]\n"
      "fmla v29.4s, v18.4s, v0.s[0]\n"
      "fmla v30.4s, v19.4s, v0.s[0]\n"
      "fmla v31.4s, v20.4s, v0.s[0]\n"
      "cbnz x21, 8b\n"
      "9:" // Height 1: Multiply loop: No odd multiplies
      // Write the 16 results back, step C to the next column block and loop
      // while the decremented column-block counter is still positive.
      "prfm pstl1keep, [x22, #0x0]\n"
      "str q28, [x22, #0x0]\n"
      "str q29, [x22, #0x10]\n"
      "str q30, [x22, #0x20]\n"
      "str q31, [x22, #0x30]\n"
      "add x22, x22, #0x40\n"
      "subs x24, x24, #0x1\n"
      "bgt 1b\n"
      :
      : [gp] "r"(gp),
        [offsetof_A] "I"(offsetof(GemmParamsFP32, A)),
        [offsetof_B] "I"(offsetof(GemmParamsFP32, B)),
        [offsetof_C] "I"(offsetof(GemmParamsFP32, C)),
        [offsetof_b_block_cols] "I"(offsetof(GemmParamsFP32, b_block_cols)),
        [offsetof_beta] "I"(offsetof(GemmParamsFP32, beta)),
        [offsetof_k] "I"(offsetof(GemmParamsFP32, k))
      // Every general-purpose and vector register written by the asm is
      // declared below, together with the condition flags and memory.
      : "cc",
        "memory",
        "v0",
        "v1",
        "v10",
        "v11",
        "v12",
        "v13",
        "v14",
        "v15",
        "v16",
        "v17",
        "v18",
        "v19",
        "v2",
        "v20",
        "v28",
        "v29",
        "v3",
        "v30",
        "v31",
        "v4",
        "v5",
        "v6",
        "v7",
        "v8",
        "v9",
        "x20",
        "x21",
        "x22",
        "x23",
        "x24",
        "x25");
#endif // __aarch64__
}

// FP32 GEMM micro-kernel producing TWO rows of C, written as AArch64 NEON
// inline assembly. On targets where __aarch64__ is not defined the function
// body is empty and nothing is computed.
//
// Fields of *gp that are read (all accessed through offsetof immediates):
//   beta         - 32-bit float scale applied to the existing C values
//   b_block_cols - 64-bit count of column blocks to process
//   A, B, C      - 64-bit base addresses of the operands / result
//   k            - 64-bit reduction length
//   lda, ldc     - 64-bit row strides; they are added to the A / C addresses
//                  unscaled, i.e. the code uses them as byte offsets
//
// Per column block (2 rows x 16 floats of C):
//   1. acc = beta * C[block] when beta is neither 0.0 nor NaN; otherwise
//      acc = 0 and C is not read.
//   2. for each k step: acc[r] += A[r][k] * B[16 floats] for r = 0, 1.
//      The same B values feed both rows. B is consumed sequentially, 64 bytes
//      per k step, and its pointer is never rewound; the A row pointers are
//      recomputed from gp->A and gp->lda for every column block.
//   3. both rows are stored to C and the C pointer advances by 64 bytes.
//
// The column loop tests its counter at the bottom (subs/bgt), so the body
// executes once before b_block_cols is examined.
//
// Register roles inside the asm:
//   x20 = scratch / row-1 A pointer / row-1 C pointer
//   x21 = row-0 A pointer         x22 = remaining k
//   x23 = row-0 C pointer         x24 = B pointer
//   x25 = remaining column blocks x26 = "scale existing C by beta" flag
//   v0, v1  = A values for rows 0, 1
//   v2-v17  = B values (main loop), v18-v21 = B values (tail)
//   v24-v27 = row-0 accumulators, v28-v31 = row-1 accumulators
void NOINLINE gemmkernel_2x2_Neon_fp32_fA0fB0fC0(GemmParamsFP32* gp) {
#ifdef __aarch64__
  __asm__ __volatile__(
      // Load parameters and derive the accumulate flag in x26:
      // x26 = 1 by default, cleared when beta == 0.0 (EQ) or when the
      // comparison is unordered, i.e. beta is NaN (VS).
      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
      "mov x26, #0x1\n"
      "ldr x25, [%x[gp], %[offsetof_b_block_cols]]\n"
      "ldr x24, [%x[gp], %[offsetof_B]]\n"
      "ldr x23, [%x[gp], %[offsetof_C]]\n"
      "fcmp s16, #0.0\n"
      "csel x26, XZR, x26, EQ\n"
      "csel x26, XZR, x26, VS\n"
      "1:" // Height 2: Column loop
      "tbz x26, #0, 2f\n"
      // Accumulate path: load existing C for row 0 (at x23) and row 1
      // (at x23 + ldc), then scale all of it by beta. beta is re-broadcast
      // from memory each time because v16 is reused for B data further down.
      "ldr q24, [x23, #0x0]\n"
      "ldr q25, [x23, #0x10]\n"
      "add x20, %x[gp], %[offsetof_beta]\n"
      "ldr q26, [x23, #0x20]\n"
      "ldr q27, [x23, #0x30]\n"
      "ld1r { v16.4s }, [x20]\n"
      "ldr x20, [%x[gp], %[offsetof_ldc]]\n"
      "add x20, x23, x20\n"
      "ldr q28, [x20, #0x0]\n"
      "ldr q29, [x20, #0x10]\n"
      "ldr q30, [x20, #0x20]\n"
      "ldr q31, [x20, #0x30]\n"
      "fmul v24.4s, v24.4s, v16.4s\n"
      "fmul v25.4s, v25.4s, v16.4s\n"
      "fmul v26.4s, v26.4s, v16.4s\n"
      "fmul v27.4s, v27.4s, v16.4s\n"
      "fmul v28.4s, v28.4s, v16.4s\n"
      "fmul v29.4s, v29.4s, v16.4s\n"
      "fmul v30.4s, v30.4s, v16.4s\n"
      "fmul v31.4s, v31.4s, v16.4s\n"
      "b 3f\n"
      "2:" // Height 2: no accumulate
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "movi v31.16b, #0x0\n"
      "3:" // Height 2: setup done
      // Restart the A row pointers (x21 = A, x20 = A + lda) and the k counter
      // for this column block.
      "ldr x21, [%x[gp], %[offsetof_A]]\n"
      "ldr x20, [%x[gp], %[offsetof_lda]]\n"
      "ldr x22, [%x[gp], %[offsetof_k]]\n"
      // Register-to-itself move: has no effect (presumably generator residue).
      "mov x21, x21\n"
      "add x20, x21, x20\n"
      // Fewer than 4 k steps: skip the unrolled path, use the scalar tail only.
      "cmp x22, #0x4\n"
      "blt 7f\n"
      // Pipeline prologue: preload 4 A values per row (v0, v1) and the
      // matching 4 k steps of B (v2-v17, 256 bytes). The flags from
      // "cmp x22, #0x8" are consumed by the "blt 6f" after the loads.
      "ldr q0, [x21, #0x0]\n"
      "ldr q1, [x20, #0x0]\n"
      "cmp x22, #0x8\n"
      "ldr q2, [x24, #0x0]\n"
      "ldr q3, [x24, #0x10]\n"
      "ldr q4, [x24, #0x20]\n"
      "ldr q5, [x24, #0x30]\n"
      "ldr q6, [x24, #0x40]\n"
      "ldr q7, [x24, #0x50]\n"
      "ldr q8, [x24, #0x60]\n"
      "ldr q9, [x24, #0x70]\n"
      "ldr q10, [x24, #0x80]\n"
      "ldr q11, [x24, #0x90]\n"
      "ldr q12, [x24, #0xa0]\n"
      "ldr q13, [x24, #0xb0]\n"
      "ldr q14, [x24, #0xc0]\n"
      "ldr q15, [x24, #0xd0]\n"
      "ldr q16, [x24, #0xe0]\n"
      "ldr q17, [x24, #0xf0]\n"
      "blt 6f\n"
      // Main loop: consumes the 4 preloaded k steps for both rows while
      // loading the next 4. It is entered/repeated only while at least 8 k
      // steps remain, so the data loaded here is always consumed by a later
      // iteration or by the final block at label 6. Each B register is
      // reloaded only after both rows have used it.
      "5:" // Height 2: Multiply loop: Main loop head
      "fmla v24.4s, v2.4s, v0.s[0]\n"
      "fmla v28.4s, v2.4s, v1.s[0]\n"
      "sub x22, x22, #0x4\n"
      "add x21, x21, #0x10\n"
      "fmla v25.4s, v3.4s, v0.s[0]\n"
      "fmla v29.4s, v3.4s, v1.s[0]\n"
      "add x20, x20, #0x10\n"
      "cmp x22, #0x8\n"
      "fmla v26.4s, v4.4s, v0.s[0]\n"
      "fmla v30.4s, v4.4s, v1.s[0]\n"
      "add x24, x24, #0x100\n"
      // Prefetch row-0 A 128 bytes ahead of the current position.
      "prfm pldl1keep, [x21, #0x80]\n"
      "ldr q2, [x24, #0x0]\n"
      "ldr q3, [x24, #0x10]\n"
      "fmla v27.4s, v5.4s, v0.s[0]\n"
      "fmla v31.4s, v5.4s, v1.s[0]\n"
      "ldr q4, [x24, #0x20]\n"
      "ldr q5, [x24, #0x30]\n"
      "fmla v24.4s, v6.4s, v0.s[1]\n"
      "fmla v28.4s, v6.4s, v1.s[1]\n"
      "ldr q6, [x24, #0x40]\n"
      "fmla v25.4s, v7.4s, v0.s[1]\n"
      "fmla v29.4s, v7.4s, v1.s[1]\n"
      "ldr q7, [x24, #0x50]\n"
      "fmla v26.4s, v8.4s, v0.s[1]\n"
      "fmla v30.4s, v8.4s, v1.s[1]\n"
      "ldr q8, [x24, #0x60]\n"
      // Prefetch row-1 A 128 bytes ahead of the current position.
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v27.4s, v9.4s, v0.s[1]\n"
      "fmla v31.4s, v9.4s, v1.s[1]\n"
      "ldr q9, [x24, #0x70]\n"
      "fmla v24.4s, v10.4s, v0.s[2]\n"
      "fmla v28.4s, v10.4s, v1.s[2]\n"
      "ldr q10, [x24, #0x80]\n"
      "fmla v25.4s, v11.4s, v0.s[2]\n"
      "fmla v29.4s, v11.4s, v1.s[2]\n"
      "ldr q11, [x24, #0x90]\n"
      "fmla v26.4s, v12.4s, v0.s[2]\n"
      "fmla v30.4s, v12.4s, v1.s[2]\n"
      "ldr q12, [x24, #0xa0]\n"
      "fmla v27.4s, v13.4s, v0.s[2]\n"
      "fmla v31.4s, v13.4s, v1.s[2]\n"
      "ldr q13, [x24, #0xb0]\n"
      "fmla v24.4s, v14.4s, v0.s[3]\n"
      "fmla v28.4s, v14.4s, v1.s[3]\n"
      "ldr q14, [x24, #0xc0]\n"
      "fmla v25.4s, v15.4s, v0.s[3]\n"
      "fmla v29.4s, v15.4s, v1.s[3]\n"
      "ldr q15, [x24, #0xd0]\n"
      "fmla v26.4s, v16.4s, v0.s[3]\n"
      "fmla v30.4s, v16.4s, v1.s[3]\n"
      "ldr q16, [x24, #0xe0]\n"
      // v0 / v1 are reloaded immediately after their last use in this pass.
      "fmla v27.4s, v17.4s, v0.s[3]\n"
      "ldr q0, [x21, #0x0]\n"
      "fmla v31.4s, v17.4s, v1.s[3]\n"
      "ldr q1, [x20, #0x0]\n"
      "ldr q17, [x24, #0xf0]\n"
      "bge 5b\n"
      // Pipeline epilogue: consume the last preloaded group of 4 k steps
      // without loading any further data.
      "6:" // Height 2: Multiply loop: Single iteration only
      "fmla v24.4s, v2.4s, v0.s[0]\n"
      "fmla v28.4s, v2.4s, v1.s[0]\n"
      "add x21, x21, #0x10\n"
      "add x20, x20, #0x10\n"
      "fmla v25.4s, v3.4s, v0.s[0]\n"
      "fmla v29.4s, v3.4s, v1.s[0]\n"
      "sub x22, x22, #0x4\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "fmla v26.4s, v4.4s, v0.s[0]\n"
      "fmla v30.4s, v4.4s, v1.s[0]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "add x24, x24, #0x100\n"
      "fmla v27.4s, v5.4s, v0.s[0]\n"
      "fmla v31.4s, v5.4s, v1.s[0]\n"
      "fmla v24.4s, v6.4s, v0.s[1]\n"
      "fmla v28.4s, v6.4s, v1.s[1]\n"
      "fmla v25.4s, v7.4s, v0.s[1]\n"
      "fmla v29.4s, v7.4s, v1.s[1]\n"
      "fmla v26.4s, v8.4s, v0.s[1]\n"
      "fmla v30.4s, v8.4s, v1.s[1]\n"
      "fmla v27.4s, v9.4s, v0.s[1]\n"
      "fmla v31.4s, v9.4s, v1.s[1]\n"
      "fmla v24.4s, v10.4s, v0.s[2]\n"
      "fmla v28.4s, v10.4s, v1.s[2]\n"
      "fmla v25.4s, v11.4s, v0.s[2]\n"
      "fmla v29.4s, v11.4s, v1.s[2]\n"
      "fmla v26.4s, v12.4s, v0.s[2]\n"
      "fmla v30.4s, v12.4s, v1.s[2]\n"
      "fmla v27.4s, v13.4s, v0.s[2]\n"
      "fmla v31.4s, v13.4s, v1.s[2]\n"
      "fmla v24.4s, v14.4s, v0.s[3]\n"
      "fmla v28.4s, v14.4s, v1.s[3]\n"
      "fmla v25.4s, v15.4s, v0.s[3]\n"
      "fmla v29.4s, v15.4s, v1.s[3]\n"
      "fmla v26.4s, v16.4s, v0.s[3]\n"
      "fmla v30.4s, v16.4s, v1.s[3]\n"
      "fmla v27.4s, v17.4s, v0.s[3]\n"
      "fmla v31.4s, v17.4s, v1.s[3]\n"
      "7:" // Height 2: Multiply loop: Main loop skip
      "cbz x22, 9f\n"
      // Tail: one k step per iteration (1 float of A per row, 64 bytes of B)
      // for the remaining k steps that did not fill a group of 4.
      "8:" // Height 2: Multiply loop: Odd block loop
      "ldr s0, [x21], #0x4\n"
      "ldr s1, [x20], #0x4\n"
      "sub x22, x22, #0x1\n"
      "ldr q18, [x24, #0x0]\n"
      "ldr q19, [x24, #0x10]\n"
      "ldr q20, [x24, #0x20]\n"
      "ldr q21, [x24, #0x30]\n"
      "add x24, x24, #0x40\n"
      "fmla v24.4s, v18.4s, v0.s[0]\n"
      "fmla v28.4s, v18.4s, v1.s[0]\n"
      "fmla v25.4s, v19.4s, v0.s[0]\n"
      "fmla v29.4s, v19.4s, v1.s[0]\n"
      "fmla v26.4s, v20.4s, v0.s[0]\n"
      "fmla v30.4s, v20.4s, v1.s[0]\n"
      "fmla v27.4s, v21.4s, v0.s[0]\n"
      "fmla v31.4s, v21.4s, v1.s[0]\n"
      "cbnz x22, 8b\n"
      "9:" // Height 2: Multiply loop: No odd multiplies
      // Write both rows back. x20 (row-1 address) is computed from the
      // not-yet-advanced x23; x23 is stepped to the next column block only
      // after its last row-0 store, and the row-1 stores use x20.
      "ldr x20, [%x[gp], %[offsetof_ldc]]\n"
      "prfm pstl1keep, [x23, #0x0]\n"
      "str q24, [x23, #0x0]\n"
      "str q25, [x23, #0x10]\n"
      "str q26, [x23, #0x20]\n"
      "add x20, x23, x20\n"
      "prfm pstl1keep, [x20, #0x0]\n"
      "str q27, [x23, #0x30]\n"
      "add x23, x23, #0x40\n"
      "str q28, [x20, #0x0]\n"
      "str q29, [x20, #0x10]\n"
      "str q30, [x20, #0x20]\n"
      "str q31, [x20, #0x30]\n"
      // Loop while the decremented column-block counter is still positive.
      "subs x25, x25, #0x1\n"
      "bgt 1b\n"
      :
      : [gp] "r"(gp),
        [offsetof_A] "I"(offsetof(GemmParamsFP32, A)),
        [offsetof_B] "I"(offsetof(GemmParamsFP32, B)),
        [offsetof_C] "I"(offsetof(GemmParamsFP32, C)),
        [offsetof_b_block_cols] "I"(offsetof(GemmParamsFP32, b_block_cols)),
        [offsetof_beta] "I"(offsetof(GemmParamsFP32, beta)),
        [offsetof_k] "I"(offsetof(GemmParamsFP32, k)),
        [offsetof_lda] "I"(offsetof(GemmParamsFP32, lda)),
        [offsetof_ldc] "I"(offsetof(GemmParamsFP32, ldc))
      // Every general-purpose and vector register written by the asm is
      // declared below, together with the condition flags and memory.
      : "cc",
        "memory",
        "v0",
        "v1",
        "v10",
        "v11",
        "v12",
        "v13",
        "v14",
        "v15",
        "v16",
        "v17",
        "v18",
        "v19",
        "v2",
        "v20",
        "v21",
        "v24",
        "v25",
        "v26",
        "v27",
        "v28",
        "v29",
        "v3",
        "v30",
        "v31",
        "v4",
        "v5",
        "v6",
        "v7",
        "v8",
        "v9",
        "x20",
        "x21",
        "x22",
        "x23",
        "x24",
        "x25",
        "x26");
#endif // __aarch64__
}

// FP32 GEMM micro-kernel producing THREE rows of C, written as AArch64 NEON
// inline assembly. On targets where __aarch64__ is not defined the function
// body is empty and nothing is computed.
//
// Fields of *gp that are read (all accessed through offsetof immediates):
//   beta         - 32-bit float scale applied to the existing C values
//   b_block_cols - 64-bit count of column blocks to process
//   A, B, C      - 64-bit base addresses of the operands / result
//   k            - 64-bit reduction length
//   lda, ldc     - 64-bit row strides; they are added to the A / C addresses
//                  unscaled, i.e. the code uses them as byte offsets
//
// Per column block (3 rows x 16 floats of C):
//   1. acc = beta * C[block] when beta is neither 0.0 nor NaN; otherwise
//      acc = 0 and C is not read.
//   2. for each k step: acc[r] += A[r][k] * B[16 floats] for r = 0, 1, 2.
//      The same B values feed all rows. B is consumed sequentially, 64 bytes
//      per k step, and its pointer is never rewound; the A row pointers are
//      recomputed from gp->A and gp->lda for every column block.
//   3. all rows are stored to C and the C pointer advances by 64 bytes.
//
// The column loop tests its counter at the bottom (subs/bgt), so the body
// executes once before b_block_cols is examined.
//
// Register roles inside the asm:
//   x20 = scratch / row-2 A pointer / row-2 C pointer
//   x21 = scratch / row-1 A pointer / row-1 C pointer
//   x22 = row-0 A pointer         x23 = remaining k
//   x24 = row-0 C pointer         x25 = B pointer
//   x26 = remaining column blocks x27 = "scale existing C by beta" flag
//   v0, v1, v2 = A values for rows 0, 1, 2
//   v3-v18  = B values (main loop); v19, v3, v4, v5 = B values (tail)
//   v20-v23 = row-0, v24-v27 = row-1, v28-v31 = row-2 accumulators
void NOINLINE gemmkernel_3x2_Neon_fp32_fA0fB0fC0(GemmParamsFP32* gp) {
#ifdef __aarch64__
  __asm__ __volatile__(
      // Load parameters and derive the accumulate flag in x27:
      // x27 = 1 by default, cleared when beta == 0.0 (EQ) or when the
      // comparison is unordered, i.e. beta is NaN (VS).
      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
      "mov x27, #0x1\n"
      "ldr x26, [%x[gp], %[offsetof_b_block_cols]]\n"
      "ldr x25, [%x[gp], %[offsetof_B]]\n"
      "ldr x24, [%x[gp], %[offsetof_C]]\n"
      "fcmp s16, #0.0\n"
      "csel x27, XZR, x27, EQ\n"
      "csel x27, XZR, x27, VS\n"
      "1:" // Height 3: Column loop
      "tbz x27, #0, 2f\n"
      // Accumulate path: load existing C for rows 0..2 (x24, x24 + ldc,
      // x24 + 2*ldc) and scale all of it by beta. beta is re-broadcast from
      // memory each time because v16 is reused for B data further down.
      "ldr q20, [x24, #0x0]\n"
      "ldr q21, [x24, #0x10]\n"
      "add x20, %x[gp], %[offsetof_beta]\n"
      "ldr q22, [x24, #0x20]\n"
      "ldr q23, [x24, #0x30]\n"
      "ld1r { v16.4s }, [x20]\n"
      "ldr x21, [%x[gp], %[offsetof_ldc]]\n"
      "add x20, x24, x21\n"
      "ldr q24, [x20, #0x0]\n"
      "ldr q25, [x20, #0x10]\n"
      "ldr q26, [x20, #0x20]\n"
      "ldr q27, [x20, #0x30]\n"
      "add x20, x20, x21\n"
      "ldr q28, [x20, #0x0]\n"
      "ldr q29, [x20, #0x10]\n"
      "fmul v20.4s, v20.4s, v16.4s\n"
      "fmul v21.4s, v21.4s, v16.4s\n"
      "ldr q30, [x20, #0x20]\n"
      "ldr q31, [x20, #0x30]\n"
      "fmul v22.4s, v22.4s, v16.4s\n"
      "fmul v23.4s, v23.4s, v16.4s\n"
      "fmul v24.4s, v24.4s, v16.4s\n"
      "fmul v25.4s, v25.4s, v16.4s\n"
      "fmul v26.4s, v26.4s, v16.4s\n"
      "fmul v27.4s, v27.4s, v16.4s\n"
      "fmul v28.4s, v28.4s, v16.4s\n"
      "fmul v29.4s, v29.4s, v16.4s\n"
      "fmul v30.4s, v30.4s, v16.4s\n"
      "fmul v31.4s, v31.4s, v16.4s\n"
      "b 3f\n"
      "2:" // Height 3: no accumulate
      "movi v20.16b, #0x0\n"
      "movi v21.16b, #0x0\n"
      "movi v22.16b, #0x0\n"
      "movi v23.16b, #0x0\n"
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "movi v31.16b, #0x0\n"
      "3:" // Height 3: setup done
      // Restart the A row pointers (x22 = A, x21 = A + lda, x20 = A + 2*lda)
      // and the k counter for this column block.
      "ldr x21, [%x[gp], %[offsetof_A]]\n"
      "ldr x20, [%x[gp], %[offsetof_lda]]\n"
      "ldr x23, [%x[gp], %[offsetof_k]]\n"
      "mov x22, x21\n"
      "add x21, x22, x20\n"
      "add x20, x21, x20\n"
      // Fewer than 4 k steps: skip the unrolled path, use the scalar tail only.
      "cmp x23, #0x4\n"
      "blt 7f\n"
      // Pipeline prologue: preload 4 A values per row (v0-v2) and the
      // matching 4 k steps of B (v3-v18, 256 bytes). The flags from
      // "cmp x23, #0x8" are consumed by the "blt 6f" after the loads.
      "ldr q0, [x22, #0x0]\n"
      "ldr q1, [x21, #0x0]\n"
      "cmp x23, #0x8\n"
      "ldr q2, [x20, #0x0]\n"
      "ldr q3, [x25, #0x0]\n"
      "ldr q4, [x25, #0x10]\n"
      "ldr q5, [x25, #0x20]\n"
      "ldr q6, [x25, #0x30]\n"
      "ldr q7, [x25, #0x40]\n"
      "ldr q8, [x25, #0x50]\n"
      "ldr q9, [x25, #0x60]\n"
      "ldr q10, [x25, #0x70]\n"
      "ldr q11, [x25, #0x80]\n"
      "ldr q12, [x25, #0x90]\n"
      "ldr q13, [x25, #0xa0]\n"
      "ldr q14, [x25, #0xb0]\n"
      "ldr q15, [x25, #0xc0]\n"
      "ldr q16, [x25, #0xd0]\n"
      "ldr q17, [x25, #0xe0]\n"
      "ldr q18, [x25, #0xf0]\n"
      "blt 6f\n"
      // Main loop: consumes the 4 preloaded k steps for all three rows while
      // loading the next 4. It is entered/repeated only while at least 8 k
      // steps remain, so the data loaded here is always consumed by a later
      // iteration or by the final block at label 6. Each B register is
      // reloaded only after all three rows have used it.
      "5:" // Height 3: Multiply loop: Main loop head
      "fmla v20.4s, v3.4s, v0.s[0]\n"
      "fmla v24.4s, v3.4s, v1.s[0]\n"
      "sub x23, x23, #0x4\n"
      "add x22, x22, #0x10\n"
      "fmla v28.4s, v3.4s, v2.s[0]\n"
      "fmla v21.4s, v4.4s, v0.s[0]\n"
      "add x21, x21, #0x10\n"
      "add x20, x20, #0x10\n"
      "fmla v25.4s, v4.4s, v1.s[0]\n"
      "fmla v29.4s, v4.4s, v2.s[0]\n"
      "cmp x23, #0x8\n"
      "add x25, x25, #0x100\n"
      "ldr q3, [x25, #0x0]\n"
      "ldr q4, [x25, #0x10]\n"
      "fmla v22.4s, v5.4s, v0.s[0]\n"
      "fmla v26.4s, v5.4s, v1.s[0]\n"
      "fmla v30.4s, v5.4s, v2.s[0]\n"
      "ldr q5, [x25, #0x20]\n"
      "fmla v23.4s, v6.4s, v0.s[0]\n"
      // Prefetch each A row 128 bytes ahead of its current position.
      "prfm pldl1keep, [x22, #0x80]\n"
      "fmla v27.4s, v6.4s, v1.s[0]\n"
      "fmla v31.4s, v6.4s, v2.s[0]\n"
      "ldr q6, [x25, #0x30]\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "fmla v20.4s, v7.4s, v0.s[1]\n"
      "fmla v24.4s, v7.4s, v1.s[1]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v28.4s, v7.4s, v2.s[1]\n"
      "ldr q7, [x25, #0x40]\n"
      "fmla v21.4s, v8.4s, v0.s[1]\n"
      "fmla v25.4s, v8.4s, v1.s[1]\n"
      "fmla v29.4s, v8.4s, v2.s[1]\n"
      "ldr q8, [x25, #0x50]\n"
      "fmla v22.4s, v9.4s, v0.s[1]\n"
      "fmla v26.4s, v9.4s, v1.s[1]\n"
      "fmla v30.4s, v9.4s, v2.s[1]\n"
      "ldr q9, [x25, #0x60]\n"
      "fmla v23.4s, v10.4s, v0.s[1]\n"
      "fmla v27.4s, v10.4s, v1.s[1]\n"
      "fmla v31.4s, v10.4s, v2.s[1]\n"
      "ldr q10, [x25, #0x70]\n"
      "fmla v20.4s, v11.4s, v0.s[2]\n"
      "fmla v24.4s, v11.4s, v1.s[2]\n"
      "fmla v28.4s, v11.4s, v2.s[2]\n"
      "ldr q11, [x25, #0x80]\n"
      "fmla v21.4s, v12.4s, v0.s[2]\n"
      "fmla v25.4s, v12.4s, v1.s[2]\n"
      "fmla v29.4s, v12.4s, v2.s[2]\n"
      "ldr q12, [x25, #0x90]\n"
      "fmla v22.4s, v13.4s, v0.s[2]\n"
      "fmla v26.4s, v13.4s, v1.s[2]\n"
      "fmla v30.4s, v13.4s, v2.s[2]\n"
      "ldr q13, [x25, #0xa0]\n"
      "fmla v23.4s, v14.4s, v0.s[2]\n"
      "fmla v27.4s, v14.4s, v1.s[2]\n"
      "fmla v31.4s, v14.4s, v2.s[2]\n"
      "ldr q14, [x25, #0xb0]\n"
      "fmla v20.4s, v15.4s, v0.s[3]\n"
      "fmla v24.4s, v15.4s, v1.s[3]\n"
      "fmla v28.4s, v15.4s, v2.s[3]\n"
      "ldr q15, [x25, #0xc0]\n"
      "fmla v21.4s, v16.4s, v0.s[3]\n"
      "fmla v25.4s, v16.4s, v1.s[3]\n"
      "fmla v29.4s, v16.4s, v2.s[3]\n"
      "ldr q16, [x25, #0xd0]\n"
      "fmla v22.4s, v17.4s, v0.s[3]\n"
      "fmla v26.4s, v17.4s, v1.s[3]\n"
      "fmla v30.4s, v17.4s, v2.s[3]\n"
      "ldr q17, [x25, #0xe0]\n"
      // v0 / v1 / v2 are reloaded immediately after their last use in this
      // pass.
      "fmla v23.4s, v18.4s, v0.s[3]\n"
      "ldr q0, [x22, #0x0]\n"
      "fmla v27.4s, v18.4s, v1.s[3]\n"
      "ldr q1, [x21, #0x0]\n"
      "fmla v31.4s, v18.4s, v2.s[3]\n"
      "ldr q2, [x20, #0x0]\n"
      "ldr q18, [x25, #0xf0]\n"
      "bge 5b\n"
      // Pipeline epilogue: consume the last preloaded group of 4 k steps
      // without loading any further data.
      "6:" // Height 3: Multiply loop: Single iteration only
      "fmla v20.4s, v3.4s, v0.s[0]\n"
      "fmla v24.4s, v3.4s, v1.s[0]\n"
      "add x22, x22, #0x10\n"
      "add x21, x21, #0x10\n"
      "fmla v28.4s, v3.4s, v2.s[0]\n"
      "fmla v21.4s, v4.4s, v0.s[0]\n"
      "add x20, x20, #0x10\n"
      "sub x23, x23, #0x4\n"
      "fmla v25.4s, v4.4s, v1.s[0]\n"
      "fmla v29.4s, v4.4s, v2.s[0]\n"
      "prfm pldl1keep, [x22, #0x80]\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "fmla v22.4s, v5.4s, v0.s[0]\n"
      "fmla v26.4s, v5.4s, v1.s[0]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "add x25, x25, #0x100\n"
      "fmla v30.4s, v5.4s, v2.s[0]\n"
      "fmla v23.4s, v6.4s, v0.s[0]\n"
      "fmla v27.4s, v6.4s, v1.s[0]\n"
      "fmla v31.4s, v6.4s, v2.s[0]\n"
      "fmla v20.4s, v7.4s, v0.s[1]\n"
      "fmla v24.4s, v7.4s, v1.s[1]\n"
      "fmla v28.4s, v7.4s, v2.s[1]\n"
      "fmla v21.4s, v8.4s, v0.s[1]\n"
      "fmla v25.4s, v8.4s, v1.s[1]\n"
      "fmla v29.4s, v8.4s, v2.s[1]\n"
      "fmla v22.4s, v9.4s, v0.s[1]\n"
      "fmla v26.4s, v9.4s, v1.s[1]\n"
      "fmla v30.4s, v9.4s, v2.s[1]\n"
      "fmla v23.4s, v10.4s, v0.s[1]\n"
      "fmla v27.4s, v10.4s, v1.s[1]\n"
      "fmla v31.4s, v10.4s, v2.s[1]\n"
      "fmla v20.4s, v11.4s, v0.s[2]\n"
      "fmla v24.4s, v11.4s, v1.s[2]\n"
      "fmla v28.4s, v11.4s, v2.s[2]\n"
      "fmla v21.4s, v12.4s, v0.s[2]\n"
      "fmla v25.4s, v12.4s, v1.s[2]\n"
      "fmla v29.4s, v12.4s, v2.s[2]\n"
      "fmla v22.4s, v13.4s, v0.s[2]\n"
      "fmla v26.4s, v13.4s, v1.s[2]\n"
      "fmla v30.4s, v13.4s, v2.s[2]\n"
      "fmla v23.4s, v14.4s, v0.s[2]\n"
      "fmla v27.4s, v14.4s, v1.s[2]\n"
      "fmla v31.4s, v14.4s, v2.s[2]\n"
      "fmla v20.4s, v15.4s, v0.s[3]\n"
      "fmla v24.4s, v15.4s, v1.s[3]\n"
      "fmla v28.4s, v15.4s, v2.s[3]\n"
      "fmla v21.4s, v16.4s, v0.s[3]\n"
      "fmla v25.4s, v16.4s, v1.s[3]\n"
      "fmla v29.4s, v16.4s, v2.s[3]\n"
      "fmla v22.4s, v17.4s, v0.s[3]\n"
      "fmla v26.4s, v17.4s, v1.s[3]\n"
      "fmla v30.4s, v17.4s, v2.s[3]\n"
      "fmla v23.4s, v18.4s, v0.s[3]\n"
      "fmla v27.4s, v18.4s, v1.s[3]\n"
      "fmla v31.4s, v18.4s, v2.s[3]\n"
      "7:" // Height 3: Multiply loop: Main loop skip
      "cbz x23, 9f\n"
      // Tail: one k step per iteration (1 float of A per row, 64 bytes of B)
      // for the remaining k steps that did not fill a group of 4. B for this
      // step is held in v19, v3, v4, v5.
      "8:" // Height 3: Multiply loop: Odd block loop
      "ldr s0, [x22], #0x4\n"
      "ldr s1, [x21], #0x4\n"
      "sub x23, x23, #0x1\n"
      "ldr s2, [x20], #0x4\n"
      "ldr q19, [x25, #0x0]\n"
      "ldr q3, [x25, #0x10]\n"
      "ldr q4, [x25, #0x20]\n"
      "ldr q5, [x25, #0x30]\n"
      "add x25, x25, #0x40\n"
      "fmla v20.4s, v19.4s, v0.s[0]\n"
      "fmla v24.4s, v19.4s, v1.s[0]\n"
      "fmla v28.4s, v19.4s, v2.s[0]\n"
      "fmla v21.4s, v3.4s, v0.s[0]\n"
      "fmla v25.4s, v3.4s, v1.s[0]\n"
      "fmla v29.4s, v3.4s, v2.s[0]\n"
      "fmla v22.4s, v4.4s, v0.s[0]\n"
      "fmla v26.4s, v4.4s, v1.s[0]\n"
      "fmla v30.4s, v4.4s, v2.s[0]\n"
      "fmla v23.4s, v5.4s, v0.s[0]\n"
      "fmla v27.4s, v5.4s, v1.s[0]\n"
      "fmla v31.4s, v5.4s, v2.s[0]\n"
      "cbnz x23, 8b\n"
      "9:" // Height 3: Multiply loop: No odd multiplies
      // Write all three rows back. The row-1 / row-2 addresses (x21, x20) are
      // computed from the not-yet-advanced x24; x24 is stepped to the next
      // column block only after its last row-0 store.
      "ldr x20, [%x[gp], %[offsetof_ldc]]\n"
      "prfm pstl1keep, [x24, #0x0]\n"
      "str q20, [x24, #0x0]\n"
      "str q21, [x24, #0x10]\n"
      "str q22, [x24, #0x20]\n"
      "add x21, x24, x20\n"
      "add x20, x21, x20\n"
      "prfm pstl1keep, [x21, #0x0]\n"
      "prfm pstl1keep, [x20, #0x0]\n"
      "str q23, [x24, #0x30]\n"
      "add x24, x24, #0x40\n"
      "str q24, [x21, #0x0]\n"
      "str q25, [x21, #0x10]\n"
      "str q26, [x21, #0x20]\n"
      "str q27, [x21, #0x30]\n"
      "str q28, [x20, #0x0]\n"
      "str q29, [x20, #0x10]\n"
      "str q30, [x20, #0x20]\n"
      "str q31, [x20, #0x30]\n"
      // Loop while the decremented column-block counter is still positive.
      "subs x26, x26, #0x1\n"
      "bgt 1b\n"
      :
      : [gp] "r"(gp),
        [offsetof_A] "I"(offsetof(GemmParamsFP32, A)),
        [offsetof_B] "I"(offsetof(GemmParamsFP32, B)),
        [offsetof_C] "I"(offsetof(GemmParamsFP32, C)),
        [offsetof_b_block_cols] "I"(offsetof(GemmParamsFP32, b_block_cols)),
        [offsetof_beta] "I"(offsetof(GemmParamsFP32, beta)),
        [offsetof_k] "I"(offsetof(GemmParamsFP32, k)),
        [offsetof_lda] "I"(offsetof(GemmParamsFP32, lda)),
        [offsetof_ldc] "I"(offsetof(GemmParamsFP32, ldc))
      // Every general-purpose and vector register written by the asm is
      // declared below, together with the condition flags and memory.
      : "cc",
        "memory",
        "v0",
        "v1",
        "v10",
        "v11",
        "v12",
        "v13",
        "v14",
        "v15",
        "v16",
        "v17",
        "v18",
        "v19",
        "v2",
        "v20",
        "v21",
        "v22",
        "v23",
        "v24",
        "v25",
        "v26",
        "v27",
        "v28",
        "v29",
        "v3",
        "v30",
        "v31",
        "v4",
        "v5",
        "v6",
        "v7",
        "v8",
        "v9",
        "x20",
        "x21",
        "x22",
        "x23",
        "x24",
        "x25",
        "x26",
        "x27");
#endif // __aarch64__
}

void NOINLINE gemmkernel_4x2_Neon_fp32_fA0fB0fC0(GemmParamsFP32* gp) {
#ifdef __aarch64__
  __asm__ __volatile__(
      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
      "mov x28, #0x1\n"
      "ldr x27, [%x[gp], %[offsetof_b_block_cols]]\n"
      "ldr x26, [%x[gp], %[offsetof_B]]\n"
      "ldr x25, [%x[gp], %[offsetof_C]]\n"
      "fcmp s16, #0.0\n"
      "csel x28, XZR, x28, EQ\n"
      "csel x28, XZR, x28, VS\n"
      "1:" // Height 4: Column loop
      "tbz x28, #0, 2f\n"
      "ldr q16, [x25, #0x0]\n"
      "ldr q17, [x25, #0x10]\n"
      "add x20, %x[gp], %[offsetof_beta]\n"
      "ldr q18, [x25, #0x20]\n"
      "ldr q19, [x25, #0x30]\n"
      "ld1r { v0.4s }, [x20]\n"
      "ldr x22, [%x[gp], %[offsetof_ldc]]\n"
      "add x20, x25, x22\n"
      "ldr q20, [x20, #0x0]\n"
      "ldr q21, [x20, #0x10]\n"
      "ldr q22, [x20, #0x20]\n"
      "ldr q23, [x20, #0x30]\n"
      "add x21, x20, x22\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "add x20, x21, x22\n"
      "fmul v16.4s, v16.4s, v0.4s\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "fmul v17.4s, v17.4s, v0.4s\n"
      "fmul v18.4s, v18.4s, v0.4s\n"
      "ldr q28, [x20, #0x0]\n"
      "ldr q29, [x20, #0x10]\n"
      "fmul v19.4s, v19.4s, v0.4s\n"
      "fmul v20.4s, v20.4s, v0.4s\n"
      "ldr q30, [x20, #0x20]\n"
      "ldr q31, [x20, #0x30]\n"
      "fmul v21.4s, v21.4s, v0.4s\n"
      "fmul v22.4s, v22.4s, v0.4s\n"
      "fmul v23.4s, v23.4s, v0.4s\n"
      "fmul v24.4s, v24.4s, v0.4s\n"
      "fmul v25.4s, v25.4s, v0.4s\n"
      "fmul v26.4s, v26.4s, v0.4s\n"
      "fmul v27.4s, v27.4s, v0.4s\n"
      "fmul v28.4s, v28.4s, v0.4s\n"
      "fmul v29.4s, v29.4s, v0.4s\n"
      "fmul v30.4s, v30.4s, v0.4s\n"
      "fmul v31.4s, v31.4s, v0.4s\n"
      "b 3f\n"
      "2:" // Height 4: no accumulate
      "movi v16.16b, #0x0\n"
      "movi v17.16b, #0x0\n"
      "movi v18.16b, #0x0\n"
      "movi v19.16b, #0x0\n"
      "movi v20.16b, #0x0\n"
      "movi v21.16b, #0x0\n"
      "movi v22.16b, #0x0\n"
      "movi v23.16b, #0x0\n"
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "movi v31.16b, #0x0\n"
      "3:" // Height 4: setup done
      "ldr x21, [%x[gp], %[offsetof_A]]\n"
      "ldr x20, [%x[gp], %[offsetof_lda]]\n"
      "ldr x24, [%x[gp], %[offsetof_k]]\n"
      "mov x23, x21\n"
      "add x22, x23, x20\n"
      "add x21, x22, x20\n"
      "add x20, x21, x20\n"
      "cmp x24, #0x4\n"
      "blt 7f\n"
      "ldr q0, [x23, #0x0]\n"
      "ldr q1, [x22, #0x0]\n"
      "cmp x24, #0x8\n"
      "ldr q2, [x21, #0x0]\n"
      "ldr q3, [x20, #0x0]\n"
      "ldr q4, [x26, #0x0]\n"
      "ldr q5, [x26, #0x10]\n"
      "ldr q6, [x26, #0x20]\n"
      "ldr q7, [x26, #0x30]\n"
      "ldr q8, [x26, #0x40]\n"
      "ldr q9, [x26, #0x50]\n"
      "ldr q10, [x26, #0x60]\n"
      "ldr q11, [x26, #0x70]\n"
      "ldr q12, [x26, #0x80]\n"
      "ldr q13, [x26, #0x90]\n"
      "ldr q14, [x26, #0xa0]\n"
      "ldr q15, [x26, #0xb0]\n"
      "blt 6f\n"
      "5:" // Height 4: Multiply loop: Main loop head
      "fmla v16.4s, v4.4s, v0.s[0]\n"
      "fmla v20.4s, v4.4s, v1.s[0]\n"
      "sub x24, x24, #0x4\n"
      "add x23, x23, #0x10\n"
      "fmla v24.4s, v4.4s, v2.s[0]\n"
      "fmla v28.4s, v4.4s, v3.s[0]\n"
      "ldr q4, [x26, #0xc0]\n"
      "add x22, x22, #0x10\n"
      "fmla v17.4s, v5.4s, v0.s[0]\n"
      "fmla v21.4s, v5.4s, v1.s[0]\n"
      "add x21, x21, #0x10\n"
      "add x20, x20, #0x10\n"
      "fmla v25.4s, v5.4s, v2.s[0]\n"
      "fmla v29.4s, v5.4s, v3.s[0]\n"
      "ldr q5, [x26, #0xd0]\n"
      "cmp x24, #0x8\n"
      "fmla v18.4s, v6.4s, v0.s[0]\n"
      "fmla v22.4s, v6.4s, v1.s[0]\n"
      "prfm pldl1keep, [x23, #0x80]\n"
      "prfm pldl1keep, [x22, #0x80]\n"
      "fmla v26.4s, v6.4s, v2.s[0]\n"
      "fmla v30.4s, v6.4s, v3.s[0]\n"
      "ldr q6, [x26, #0xe0]\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "fmla v19.4s, v7.4s, v0.s[0]\n"
      "fmla v23.4s, v7.4s, v1.s[0]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v27.4s, v7.4s, v2.s[0]\n"
      "fmla v31.4s, v7.4s, v3.s[0]\n"
      "ldr q7, [x26, #0xf0]\n"
      "add x26, x26, #0x100\n"
      "fmla v16.4s, v8.4s, v0.s[1]\n"
      "fmla v20.4s, v8.4s, v1.s[1]\n"
      "fmla v24.4s, v8.4s, v2.s[1]\n"
      "fmla v28.4s, v8.4s, v3.s[1]\n"
      "ldr q8, [x26, #0x40]\n"
      "fmla v17.4s, v9.4s, v0.s[1]\n"
      "fmla v21.4s, v9.4s, v1.s[1]\n"
      "fmla v25.4s, v9.4s, v2.s[1]\n"
      "fmla v29.4s, v9.4s, v3.s[1]\n"
      "ldr q9, [x26, #0x50]\n"
      "fmla v18.4s, v10.4s, v0.s[1]\n"
      "fmla v22.4s, v10.4s, v1.s[1]\n"
      "fmla v26.4s, v10.4s, v2.s[1]\n"
      "fmla v30.4s, v10.4s, v3.s[1]\n"
      "ldr q10, [x26, #0x60]\n"
      "fmla v19.4s, v11.4s, v0.s[1]\n"
      "fmla v23.4s, v11.4s, v1.s[1]\n"
      "fmla v27.4s, v11.4s, v2.s[1]\n"
      "fmla v31.4s, v11.4s, v3.s[1]\n"
      "ldr q11, [x26, #0x70]\n"
      "fmla v16.4s, v12.4s, v0.s[2]\n"
      "fmla v20.4s, v12.4s, v1.s[2]\n"
      "fmla v24.4s, v12.4s, v2.s[2]\n"
      "fmla v28.4s, v12.4s, v3.s[2]\n"
      "ldr q12, [x26, #0x80]\n"
      "fmla v17.4s, v13.4s, v0.s[2]\n"
      "fmla v21.4s, v13.4s, v1.s[2]\n"
      "fmla v25.4s, v13.4s, v2.s[2]\n"
      "fmla v29.4s, v13.4s, v3.s[2]\n"
      "ldr q13, [x26, #0x90]\n"
      "fmla v18.4s, v14.4s, v0.s[2]\n"
      "fmla v22.4s, v14.4s, v1.s[2]\n"
      "fmla v26.4s, v14.4s, v2.s[2]\n"
      "fmla v30.4s, v14.4s, v3.s[2]\n"
      "ldr q14, [x26, #0xa0]\n"
      "fmla v19.4s, v15.4s, v0.s[2]\n"
      "fmla v23.4s, v15.4s, v1.s[2]\n"
      "fmla v27.4s, v15.4s, v2.s[2]\n"
      "fmla v31.4s, v15.4s, v3.s[2]\n"
      "ldr q15, [x26, #0xb0]\n"
      "fmla v16.4s, v4.4s, v0.s[3]\n"
      "fmla v20.4s, v4.4s, v1.s[3]\n"
      "fmla v24.4s, v4.4s, v2.s[3]\n"
      "fmla v28.4s, v4.4s, v3.s[3]\n"
      "ldr q4, [x26, #0x0]\n"
      "fmla v17.4s, v5.4s, v0.s[3]\n"
      "fmla v21.4s, v5.4s, v1.s[3]\n"
      "fmla v25.4s, v5.4s, v2.s[3]\n"
      "fmla v29.4s, v5.4s, v3.s[3]\n"
      "ldr q5, [x26, #0x10]\n"
      "fmla v18.4s, v6.4s, v0.s[3]\n"
      "fmla v22.4s, v6.4s, v1.s[3]\n"
      "fmla v26.4s, v6.4s, v2.s[3]\n"
      "fmla v30.4s, v6.4s, v3.s[3]\n"
      "ldr q6, [x26, #0x20]\n"
      "fmla v19.4s, v7.4s, v0.s[3]\n"
      "ldr q0, [x23, #0x0]\n"
      "fmla v23.4s, v7.4s, v1.s[3]\n"
      "ldr q1, [x22, #0x0]\n"
      "fmla v27.4s, v7.4s, v2.s[3]\n"
      "ldr q2, [x21, #0x0]\n"
      "fmla v31.4s, v7.4s, v3.s[3]\n"
      "ldr q3, [x20, #0x0]\n"
      "ldr q7, [x26, #0x30]\n"
      "bge 5b\n"
      "6:" // Height 4: Multiply loop: Single iteration only
      "fmla v16.4s, v4.4s, v0.s[0]\n"
      "fmla v20.4s, v4.4s, v1.s[0]\n"
      "add x23, x23, #0x10\n"
      "add x22, x22, #0x10\n"
      "fmla v24.4s, v4.4s, v2.s[0]\n"
      "fmla v28.4s, v4.4s, v3.s[0]\n"
      "ldr q4, [x26, #0xc0]\n"
      "add x21, x21, #0x10\n"
      "fmla v17.4s, v5.4s, v0.s[0]\n"
      "fmla v21.4s, v5.4s, v1.s[0]\n"
      "add x20, x20, #0x10\n"
      "sub x24, x24, #0x4\n"
      "fmla v25.4s, v5.4s, v2.s[0]\n"
      "fmla v29.4s, v5.4s, v3.s[0]\n"
      "ldr q5, [x26, #0xd0]\n"
      "prfm pldl1keep, [x23, #0x80]\n"
      "fmla v18.4s, v6.4s, v0.s[0]\n"
      "fmla v22.4s, v6.4s, v1.s[0]\n"
      "prfm pldl1keep, [x22, #0x80]\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "fmla v26.4s, v6.4s, v2.s[0]\n"
      "fmla v30.4s, v6.4s, v3.s[0]\n"
      "ldr q6, [x26, #0xe0]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v19.4s, v7.4s, v0.s[0]\n"
      "fmla v23.4s, v7.4s, v1.s[0]\n"
      "fmla v27.4s, v7.4s, v2.s[0]\n"
      "fmla v31.4s, v7.4s, v3.s[0]\n"
      "ldr q7, [x26, #0xf0]\n"
      "add x26, x26, #0x100\n"
      "fmla v16.4s, v8.4s, v0.s[1]\n"
      "fmla v20.4s, v8.4s, v1.s[1]\n"
      "fmla v24.4s, v8.4s, v2.s[1]\n"
      "fmla v28.4s, v8.4s, v3.s[1]\n"
      "fmla v17.4s, v9.4s, v0.s[1]\n"
      "fmla v21.4s, v9.4s, v1.s[1]\n"
      "fmla v25.4s, v9.4s, v2.s[1]\n"
      "fmla v29.4s, v9.4s, v3.s[1]\n"
      "fmla v18.4s, v10.4s, v0.s[1]\n"
      "fmla v22.4s, v10.4s, v1.s[1]\n"
      "fmla v26.4s, v10.4s, v2.s[1]\n"
      "fmla v30.4s, v10.4s, v3.s[1]\n"
      "fmla v19.4s, v11.4s, v0.s[1]\n"
      "fmla v23.4s, v11.4s, v1.s[1]\n"
      "fmla v27.4s, v11.4s, v2.s[1]\n"
      "fmla v31.4s, v11.4s, v3.s[1]\n"
      "fmla v16.4s, v12.4s, v0.s[2]\n"
      "fmla v20.4s, v12.4s, v1.s[2]\n"
      "fmla v24.4s, v12.4s, v2.s[2]\n"
      "fmla v28.4s, v12.4s, v3.s[2]\n"
      "fmla v17.4s, v13.4s, v0.s[2]\n"
      "fmla v21.4s, v13.4s, v1.s[2]\n"
      "fmla v25.4s, v13.4s, v2.s[2]\n"
      "fmla v29.4s, v13.4s, v3.s[2]\n"
      "fmla v18.4s, v14.4s, v0.s[2]\n"
      "fmla v22.4s, v14.4s, v1.s[2]\n"
      "fmla v26.4s, v14.4s, v2.s[2]\n"
      "fmla v30.4s, v14.4s, v3.s[2]\n"
      "fmla v19.4s, v15.4s, v0.s[2]\n"
      "fmla v23.4s, v15.4s, v1.s[2]\n"
      "fmla v27.4s, v15.4s, v2.s[2]\n"
      "fmla v31.4s, v15.4s, v3.s[2]\n"
      "fmla v16.4s, v4.4s, v0.s[3]\n"
      "fmla v20.4s, v4.4s, v1.s[3]\n"
      "fmla v24.4s, v4.4s, v2.s[3]\n"
      "fmla v28.4s, v4.4s, v3.s[3]\n"
      "fmla v17.4s, v5.4s, v0.s[3]\n"
      "fmla v21.4s, v5.4s, v1.s[3]\n"
      "fmla v25.4s, v5.4s, v2.s[3]\n"
      "fmla v29.4s, v5.4s, v3.s[3]\n"
      "fmla v18.4s, v6.4s, v0.s[3]\n"
      "fmla v22.4s, v6.4s, v1.s[3]\n"
      "fmla v26.4s, v6.4s, v2.s[3]\n"
      "fmla v30.4s, v6.4s, v3.s[3]\n"
      "fmla v19.4s, v7.4s, v0.s[3]\n"
      "fmla v23.4s, v7.4s, v1.s[3]\n"
      "fmla v27.4s, v7.4s, v2.s[3]\n"
      "fmla v31.4s, v7.4s, v3.s[3]\n"
      "7:" // Height 4: Multiply loop: Main loop skip
      "cbz x24, 9f\n"
      "8:" // Height 4: Multiply loop: Odd block loop
      "ldr s0, [x23], #0x4\n"
      "ldr s1, [x22], #0x4\n"
      "sub x24, x24, #0x1\n"
      "ldr s2, [x21], #0x4\n"
      "ldr s3, [x20], #0x4\n"
      "ldr q8, [x26, #0x0]\n"
      "ldr q9, [x26, #0x10]\n"
      "ldr q10, [x26, #0x20]\n"
      "ldr q11, [x26, #0x30]\n"
      "add x26, x26, #0x40\n"
      "fmla v16.4s, v8.4s, v0.s[0]\n"
      "fmla v20.4s, v8.4s, v1.s[0]\n"
      "fmla v24.4s, v8.4s, v2.s[0]\n"
      "fmla v28.4s, v8.4s, v3.s[0]\n"
      "fmla v17.4s, v9.4s, v0.s[0]\n"
      "fmla v21.4s, v9.4s, v1.s[0]\n"
      "fmla v25.4s, v9.4s, v2.s[0]\n"
      "fmla v29.4s, v9.4s, v3.s[0]\n"
      "fmla v18.4s, v10.4s, v0.s[0]\n"
      "fmla v22.4s, v10.4s, v1.s[0]\n"
      "fmla v26.4s, v10.4s, v2.s[0]\n"
      "fmla v30.4s, v10.4s, v3.s[0]\n"
      "fmla v19.4s, v11.4s, v0.s[0]\n"
      "fmla v23.4s, v11.4s, v1.s[0]\n"
      "fmla v27.4s, v11.4s, v2.s[0]\n"
      "fmla v31.4s, v11.4s, v3.s[0]\n"
      "cbnz x24, 8b\n"
      "9:" // Height 4: Multiply loop: No odd multiplies
      "ldr x20, [%x[gp], %[offsetof_ldc]]\n"
      "prfm pstl1keep, [x25, #0x0]\n"
      "str q16, [x25, #0x0]\n"
      "str q17, [x25, #0x10]\n"
      "str q18, [x25, #0x20]\n"
      "add x22, x25, x20\n"
      "add x21, x22, x20\n"
      "add x20, x21, x20\n"
      "prfm pstl1keep, [x22, #0x0]\n"
      "prfm pstl1keep, [x21, #0x0]\n"
      "str q19, [x25, #0x30]\n"
      "prfm pstl1keep, [x20, #0x0]\n"
      "str q20, [x22, #0x0]\n"
      "add x25, x25, #0x40\n"
      "str q21, [x22, #0x10]\n"
      "str q22, [x22, #0x20]\n"
      "str q23, [x22, #0x30]\n"
      "str q24, [x21, #0x0]\n"
      "str q25, [x21, #0x10]\n"
      "str q26, [x21, #0x20]\n"
      "str q27, [x21, #0x30]\n"
      "str q28, [x20, #0x0]\n"
      "str q29, [x20, #0x10]\n"
      "str q30, [x20, #0x20]\n"
      "str q31, [x20, #0x30]\n"
      "subs x27, x27, #0x1\n"
      "bgt 1b\n"
      :
      : [gp] "r"(gp),
        [offsetof_A] "I"(offsetof(GemmParamsFP32, A)),
        [offsetof_B] "I"(offsetof(GemmParamsFP32, B)),
        [offsetof_C] "I"(offsetof(GemmParamsFP32, C)),
        [offsetof_b_block_cols] "I"(offsetof(GemmParamsFP32, b_block_cols)),
        [offsetof_beta] "I"(offsetof(GemmParamsFP32, beta)),
        [offsetof_k] "I"(offsetof(GemmParamsFP32, k)),
        [offsetof_lda] "I"(offsetof(GemmParamsFP32, lda)),
        [offsetof_ldc] "I"(offsetof(GemmParamsFP32, ldc))
      : "cc",
        "memory",
        "v0",
        "v1",
        "v10",
        "v11",
        "v12",
        "v13",
        "v14",
        "v15",
        "v16",
        "v17",
        "v18",
        "v19",
        "v2",
        "v20",
        "v21",
        "v22",
        "v23",
        "v24",
        "v25",
        "v26",
        "v27",
        "v28",
        "v29",
        "v3",
        "v30",
        "v31",
        "v4",
        "v5",
        "v6",
        "v7",
        "v8",
        "v9",
        "x20",
        "x21",
        "x22",
        "x23",
        "x24",
        "x25",
        "x26",
        "x27",
        "x28");
#endif // __aarch64__
}

// FP32 GEMM micro-kernel written as AArch64 NEON inline assembly. Each pass of
// the column loop produces a tile of 5 rows x 16 floats of C (four q-registers
// per row) as   C_tile = (beta * C_tile, or 0) + sum over k of A[row][k] * B[k][0..15].
//
// Fields read from *gp (all accessed through offsetof immediates):
//   beta          scalar applied to the existing C tile. When beta == 0.0 or
//                 beta is NaN the C tile is not read and the accumulators
//                 start from zero.
//   b_block_cols  number of column-loop passes. The counter is only tested at
//                 the bottom of the loop, so the body always runs at least once.
//   A, lda        base pointer of row 0 of A; lda is added directly to the
//                 pointer to reach the following rows (i.e. used as a byte
//                 stride here).
//   B             read strictly sequentially, 0x40 bytes (16 floats) per
//                 k-step; the cursor keeps advancing across column passes.
//   C, ldc        base pointer of row 0 of the C tile; ldc is added directly
//                 to the pointer (byte stride). The cursor advances 0x40 bytes
//                 per column pass.
//   k             number of floats consumed from each A row per column pass.
//
// Register roles inside the asm block:
//   x9          1 if the C tile must be loaded and scaled by beta, else 0
//   x28         remaining column passes
//   x27 / x26   B cursor / C cursor
//   x25         remaining k-steps in the current column pass
//   x24..x20    A row pointers for rows 0..4 (x20-x23 double as scratch in
//               the beta preamble and the store epilogue)
//   v0-v4       4 floats (or 1 float in the tail loop) from A rows 0..4
//   v5-v11      rotating window of B vectors, each reloaded right after its
//               last use
//   v12-v15, v16-v19, v20-v23, v24-v27, v28-v31
//               accumulators for C rows 0, 1, 2, 3, 4 respectively
void NOINLINE gemmkernel_5x2_Neon_fp32_fA0fB0fC0(GemmParamsFP32* gp) {
#ifdef __aarch64__
  __asm__ __volatile__(
      // Load beta and the loop-invariant cursors. x9 starts at 1 and is
      // cleared when the compare with 0.0 is equal (EQ) or unordered (VS,
      // i.e. beta is NaN).
      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
      "mov x9, #0x1\n"
      "ldr x28, [%x[gp], %[offsetof_b_block_cols]]\n"
      "ldr x27, [%x[gp], %[offsetof_B]]\n"
      "ldr x26, [%x[gp], %[offsetof_C]]\n"
      "fcmp s16, #0.0\n"
      "csel x9, XZR, x9, EQ\n"
      "csel x9, XZR, x9, VS\n"
      "1:" // Height 5: Column loop
      "tbz x9, #0, 2f\n"
      // Beta path: load the existing 5x16 C tile into the accumulators and
      // scale every vector by beta (broadcast into all lanes of v0).
      // x22 = ldc; x20/x21 walk the C rows.
      "ldr q12, [x26, #0x0]\n"
      "ldr q13, [x26, #0x10]\n"
      "add x20, %x[gp], %[offsetof_beta]\n"
      "ldr q14, [x26, #0x20]\n"
      "ldr q15, [x26, #0x30]\n"
      "ld1r { v0.4s }, [x20]\n"
      "ldr x22, [%x[gp], %[offsetof_ldc]]\n"
      "add x20, x26, x22\n"
      "ldr q16, [x20, #0x0]\n"
      "ldr q17, [x20, #0x10]\n"
      "ldr q18, [x20, #0x20]\n"
      "ldr q19, [x20, #0x30]\n"
      "add x20, x20, x22\n"
      "ldr q20, [x20, #0x0]\n"
      "ldr q21, [x20, #0x10]\n"
      "add x21, x20, x22\n"
      "fmul v12.4s, v12.4s, v0.4s\n"
      "ldr q22, [x20, #0x20]\n"
      "ldr q23, [x20, #0x30]\n"
      "add x20, x21, x22\n"
      "fmul v13.4s, v13.4s, v0.4s\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "fmul v14.4s, v14.4s, v0.4s\n"
      "fmul v15.4s, v15.4s, v0.4s\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "fmul v16.4s, v16.4s, v0.4s\n"
      "fmul v17.4s, v17.4s, v0.4s\n"
      "ldr q28, [x20, #0x0]\n"
      "ldr q29, [x20, #0x10]\n"
      "fmul v18.4s, v18.4s, v0.4s\n"
      "fmul v19.4s, v19.4s, v0.4s\n"
      "ldr q30, [x20, #0x20]\n"
      "ldr q31, [x20, #0x30]\n"
      "fmul v20.4s, v20.4s, v0.4s\n"
      "fmul v21.4s, v21.4s, v0.4s\n"
      "fmul v22.4s, v22.4s, v0.4s\n"
      "fmul v23.4s, v23.4s, v0.4s\n"
      "fmul v24.4s, v24.4s, v0.4s\n"
      "fmul v25.4s, v25.4s, v0.4s\n"
      "fmul v26.4s, v26.4s, v0.4s\n"
      "fmul v27.4s, v27.4s, v0.4s\n"
      "fmul v28.4s, v28.4s, v0.4s\n"
      "fmul v29.4s, v29.4s, v0.4s\n"
      "fmul v30.4s, v30.4s, v0.4s\n"
      "fmul v31.4s, v31.4s, v0.4s\n"
      "b 3f\n"
      "2:" // Height 5: no accumulate
      // Zero all 20 accumulators; C is not read on this path.
      "movi v12.16b, #0x0\n"
      "movi v13.16b, #0x0\n"
      "movi v14.16b, #0x0\n"
      "movi v15.16b, #0x0\n"
      "movi v16.16b, #0x0\n"
      "movi v17.16b, #0x0\n"
      "movi v18.16b, #0x0\n"
      "movi v19.16b, #0x0\n"
      "movi v20.16b, #0x0\n"
      "movi v21.16b, #0x0\n"
      "movi v22.16b, #0x0\n"
      "movi v23.16b, #0x0\n"
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "movi v31.16b, #0x0\n"
      "3:" // Height 5: setup done
      // Re-derive the five A row pointers for this column pass:
      // x24 = A, then x23, x22, x21, x20 = each previous pointer + lda.
      // x25 = k.
      "ldr x21, [%x[gp], %[offsetof_A]]\n"
      "ldr x20, [%x[gp], %[offsetof_lda]]\n"
      "ldr x25, [%x[gp], %[offsetof_k]]\n"
      "mov x24, x21\n"
      "add x23, x24, x20\n"
      "add x22, x23, x20\n"
      "add x21, x22, x20\n"
      "add x20, x21, x20\n"
      // Fewer than 4 k-steps: skip the unrolled code and go to the tail loop.
      "cmp x25, #0x4\n"
      "blt 7f\n"
      // Preload 4 floats from every A row (v0-v4) and the first seven B
      // vectors (v5-v11). The flags set by "cmp x25, #0x8" are consumed by
      // "blt 6f" below; the loads in between do not modify them.
      "ldr q0, [x24, #0x0]\n"
      "ldr q1, [x23, #0x0]\n"
      "cmp x25, #0x8\n"
      "ldr q2, [x22, #0x0]\n"
      "ldr q3, [x21, #0x0]\n"
      "ldr q4, [x20, #0x0]\n"
      "ldr q5, [x27, #0x0]\n"
      "ldr q6, [x27, #0x10]\n"
      "ldr q7, [x27, #0x20]\n"
      "ldr q8, [x27, #0x30]\n"
      "ldr q9, [x27, #0x40]\n"
      "ldr q10, [x27, #0x50]\n"
      "ldr q11, [x27, #0x60]\n"
      "blt 6f\n"
      "5:" // Height 5: Multiply loop: Main loop head
      // One pass consumes 4 k-steps: 0x100 bytes of B and 0x10 bytes of each
      // A row (lanes 0..3 of v0-v4), and loads the operands for the next
      // pass. The flags from "cmp x25, #0x8" are consumed by "bge 5b" at the
      // bottom; nothing in between writes the flags.
      "fmla v12.4s, v5.4s, v0.s[0]\n"
      "fmla v16.4s, v5.4s, v1.s[0]\n"
      "sub x25, x25, #0x4\n"
      "add x24, x24, #0x10\n"
      "fmla v20.4s, v5.4s, v2.s[0]\n"
      "fmla v24.4s, v5.4s, v3.s[0]\n"
      "add x23, x23, #0x10\n"
      "add x22, x22, #0x10\n"
      "fmla v28.4s, v5.4s, v4.s[0]\n"
      "ldr q5, [x27, #0x70]\n"
      "fmla v13.4s, v6.4s, v0.s[0]\n"
      "add x21, x21, #0x10\n"
      "fmla v17.4s, v6.4s, v1.s[0]\n"
      "fmla v21.4s, v6.4s, v2.s[0]\n"
      "add x20, x20, #0x10\n"
      "cmp x25, #0x8\n"
      "fmla v25.4s, v6.4s, v3.s[0]\n"
      "fmla v29.4s, v6.4s, v4.s[0]\n"
      "ldr q6, [x27, #0x80]\n"
      "prfm pldl1keep, [x24, #0x80]\n"
      "fmla v14.4s, v7.4s, v0.s[0]\n"
      "fmla v18.4s, v7.4s, v1.s[0]\n"
      "prfm pldl1keep, [x23, #0x80]\n"
      "prfm pldl1keep, [x22, #0x80]\n"
      "fmla v22.4s, v7.4s, v2.s[0]\n"
      "fmla v26.4s, v7.4s, v3.s[0]\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v30.4s, v7.4s, v4.s[0]\n"
      "ldr q7, [x27, #0x90]\n"
      "fmla v15.4s, v8.4s, v0.s[0]\n"
      "fmla v19.4s, v8.4s, v1.s[0]\n"
      "fmla v23.4s, v8.4s, v2.s[0]\n"
      "fmla v27.4s, v8.4s, v3.s[0]\n"
      "fmla v31.4s, v8.4s, v4.s[0]\n"
      "ldr q8, [x27, #0xa0]\n"
      "fmla v12.4s, v9.4s, v0.s[1]\n"
      "fmla v16.4s, v9.4s, v1.s[1]\n"
      "fmla v20.4s, v9.4s, v2.s[1]\n"
      "fmla v24.4s, v9.4s, v3.s[1]\n"
      "fmla v28.4s, v9.4s, v4.s[1]\n"
      "ldr q9, [x27, #0xb0]\n"
      "fmla v13.4s, v10.4s, v0.s[1]\n"
      "fmla v17.4s, v10.4s, v1.s[1]\n"
      "fmla v21.4s, v10.4s, v2.s[1]\n"
      "fmla v25.4s, v10.4s, v3.s[1]\n"
      "fmla v29.4s, v10.4s, v4.s[1]\n"
      "ldr q10, [x27, #0xc0]\n"
      "fmla v14.4s, v11.4s, v0.s[1]\n"
      "fmla v18.4s, v11.4s, v1.s[1]\n"
      "fmla v22.4s, v11.4s, v2.s[1]\n"
      "fmla v26.4s, v11.4s, v3.s[1]\n"
      "fmla v30.4s, v11.4s, v4.s[1]\n"
      "ldr q11, [x27, #0xd0]\n"
      "fmla v15.4s, v5.4s, v0.s[1]\n"
      "fmla v19.4s, v5.4s, v1.s[1]\n"
      "fmla v23.4s, v5.4s, v2.s[1]\n"
      "fmla v27.4s, v5.4s, v3.s[1]\n"
      "fmla v31.4s, v5.4s, v4.s[1]\n"
      "ldr q5, [x27, #0xe0]\n"
      "fmla v12.4s, v6.4s, v0.s[2]\n"
      "fmla v16.4s, v6.4s, v1.s[2]\n"
      "fmla v20.4s, v6.4s, v2.s[2]\n"
      "fmla v24.4s, v6.4s, v3.s[2]\n"
      "fmla v28.4s, v6.4s, v4.s[2]\n"
      "ldr q6, [x27, #0xf0]\n"
      "fmla v13.4s, v7.4s, v0.s[2]\n"
      // The B cursor moves to the next 0x100-byte block here; every B load
      // from this point to the end of the pass fetches operands for the
      // next pass.
      "add x27, x27, #0x100\n"
      "fmla v17.4s, v7.4s, v1.s[2]\n"
      "fmla v21.4s, v7.4s, v2.s[2]\n"
      "fmla v25.4s, v7.4s, v3.s[2]\n"
      "fmla v29.4s, v7.4s, v4.s[2]\n"
      "ldr q7, [x27, #0x20]\n"
      "fmla v14.4s, v8.4s, v0.s[2]\n"
      "fmla v18.4s, v8.4s, v1.s[2]\n"
      "fmla v22.4s, v8.4s, v2.s[2]\n"
      "fmla v26.4s, v8.4s, v3.s[2]\n"
      "fmla v30.4s, v8.4s, v4.s[2]\n"
      "ldr q8, [x27, #0x30]\n"
      "fmla v15.4s, v9.4s, v0.s[2]\n"
      "fmla v19.4s, v9.4s, v1.s[2]\n"
      "fmla v23.4s, v9.4s, v2.s[2]\n"
      "fmla v27.4s, v9.4s, v3.s[2]\n"
      "fmla v31.4s, v9.4s, v4.s[2]\n"
      "ldr q9, [x27, #0x40]\n"
      "fmla v12.4s, v10.4s, v0.s[3]\n"
      "fmla v16.4s, v10.4s, v1.s[3]\n"
      "fmla v20.4s, v10.4s, v2.s[3]\n"
      "fmla v24.4s, v10.4s, v3.s[3]\n"
      "fmla v28.4s, v10.4s, v4.s[3]\n"
      "ldr q10, [x27, #0x50]\n"
      "fmla v13.4s, v11.4s, v0.s[3]\n"
      "fmla v17.4s, v11.4s, v1.s[3]\n"
      "fmla v21.4s, v11.4s, v2.s[3]\n"
      "fmla v25.4s, v11.4s, v3.s[3]\n"
      "fmla v29.4s, v11.4s, v4.s[3]\n"
      "ldr q11, [x27, #0x60]\n"
      "fmla v14.4s, v5.4s, v0.s[3]\n"
      "fmla v18.4s, v5.4s, v1.s[3]\n"
      "fmla v22.4s, v5.4s, v2.s[3]\n"
      "fmla v26.4s, v5.4s, v3.s[3]\n"
      "fmla v30.4s, v5.4s, v4.s[3]\n"
      "ldr q5, [x27, #0x0]\n"
      // Each A register is reloaded immediately after its final use in this
      // pass (the A pointers were already advanced by 0x10 above).
      "fmla v15.4s, v6.4s, v0.s[3]\n"
      "ldr q0, [x24, #0x0]\n"
      "fmla v19.4s, v6.4s, v1.s[3]\n"
      "ldr q1, [x23, #0x0]\n"
      "fmla v23.4s, v6.4s, v2.s[3]\n"
      "ldr q2, [x22, #0x0]\n"
      "fmla v27.4s, v6.4s, v3.s[3]\n"
      "ldr q3, [x21, #0x0]\n"
      "fmla v31.4s, v6.4s, v4.s[3]\n"
      "ldr q4, [x20, #0x0]\n"
      "ldr q6, [x27, #0x10]\n"
      "bge 5b\n"
      "6:" // Height 5: Multiply loop: Single iteration only
      // Same 4 k-steps as the main loop body, operating on the operands that
      // are already in v0-v11, but without loading anything for a following
      // pass. Leaves x25 holding the remaining (< 4) k-steps.
      "fmla v12.4s, v5.4s, v0.s[0]\n"
      "fmla v16.4s, v5.4s, v1.s[0]\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "fmla v20.4s, v5.4s, v2.s[0]\n"
      "fmla v24.4s, v5.4s, v3.s[0]\n"
      "add x22, x22, #0x10\n"
      "add x21, x21, #0x10\n"
      "fmla v28.4s, v5.4s, v4.s[0]\n"
      "ldr q5, [x27, #0x70]\n"
      "fmla v13.4s, v6.4s, v0.s[0]\n"
      "add x20, x20, #0x10\n"
      "fmla v17.4s, v6.4s, v1.s[0]\n"
      "fmla v21.4s, v6.4s, v2.s[0]\n"
      "sub x25, x25, #0x4\n"
      "prfm pldl1keep, [x24, #0x80]\n"
      "fmla v25.4s, v6.4s, v3.s[0]\n"
      "fmla v29.4s, v6.4s, v4.s[0]\n"
      "ldr q6, [x27, #0x80]\n"
      "prfm pldl1keep, [x23, #0x80]\n"
      "fmla v14.4s, v7.4s, v0.s[0]\n"
      "fmla v18.4s, v7.4s, v1.s[0]\n"
      "prfm pldl1keep, [x22, #0x80]\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "fmla v22.4s, v7.4s, v2.s[0]\n"
      "fmla v26.4s, v7.4s, v3.s[0]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v30.4s, v7.4s, v4.s[0]\n"
      "ldr q7, [x27, #0x90]\n"
      "fmla v15.4s, v8.4s, v0.s[0]\n"
      "fmla v19.4s, v8.4s, v1.s[0]\n"
      "fmla v23.4s, v8.4s, v2.s[0]\n"
      "fmla v27.4s, v8.4s, v3.s[0]\n"
      "fmla v31.4s, v8.4s, v4.s[0]\n"
      "ldr q8, [x27, #0xa0]\n"
      "fmla v12.4s, v9.4s, v0.s[1]\n"
      "fmla v16.4s, v9.4s, v1.s[1]\n"
      "fmla v20.4s, v9.4s, v2.s[1]\n"
      "fmla v24.4s, v9.4s, v3.s[1]\n"
      "fmla v28.4s, v9.4s, v4.s[1]\n"
      "ldr q9, [x27, #0xb0]\n"
      "fmla v13.4s, v10.4s, v0.s[1]\n"
      "fmla v17.4s, v10.4s, v1.s[1]\n"
      "fmla v21.4s, v10.4s, v2.s[1]\n"
      "fmla v25.4s, v10.4s, v3.s[1]\n"
      "fmla v29.4s, v10.4s, v4.s[1]\n"
      "ldr q10, [x27, #0xc0]\n"
      "fmla v14.4s, v11.4s, v0.s[1]\n"
      "fmla v18.4s, v11.4s, v1.s[1]\n"
      "fmla v22.4s, v11.4s, v2.s[1]\n"
      "fmla v26.4s, v11.4s, v3.s[1]\n"
      "fmla v30.4s, v11.4s, v4.s[1]\n"
      "ldr q11, [x27, #0xd0]\n"
      "fmla v15.4s, v5.4s, v0.s[1]\n"
      "fmla v19.4s, v5.4s, v1.s[1]\n"
      "fmla v23.4s, v5.4s, v2.s[1]\n"
      "fmla v27.4s, v5.4s, v3.s[1]\n"
      "fmla v31.4s, v5.4s, v4.s[1]\n"
      "ldr q5, [x27, #0xe0]\n"
      "fmla v12.4s, v6.4s, v0.s[2]\n"
      "fmla v16.4s, v6.4s, v1.s[2]\n"
      "fmla v20.4s, v6.4s, v2.s[2]\n"
      "fmla v24.4s, v6.4s, v3.s[2]\n"
      "fmla v28.4s, v6.4s, v4.s[2]\n"
      "ldr q6, [x27, #0xf0]\n"
      "fmla v13.4s, v7.4s, v0.s[2]\n"
      "add x27, x27, #0x100\n"
      "fmla v17.4s, v7.4s, v1.s[2]\n"
      "fmla v21.4s, v7.4s, v2.s[2]\n"
      "fmla v25.4s, v7.4s, v3.s[2]\n"
      "fmla v29.4s, v7.4s, v4.s[2]\n"
      "fmla v14.4s, v8.4s, v0.s[2]\n"
      "fmla v18.4s, v8.4s, v1.s[2]\n"
      "fmla v22.4s, v8.4s, v2.s[2]\n"
      "fmla v26.4s, v8.4s, v3.s[2]\n"
      "fmla v30.4s, v8.4s, v4.s[2]\n"
      "fmla v15.4s, v9.4s, v0.s[2]\n"
      "fmla v19.4s, v9.4s, v1.s[2]\n"
      "fmla v23.4s, v9.4s, v2.s[2]\n"
      "fmla v27.4s, v9.4s, v3.s[2]\n"
      "fmla v31.4s, v9.4s, v4.s[2]\n"
      "fmla v12.4s, v10.4s, v0.s[3]\n"
      "fmla v16.4s, v10.4s, v1.s[3]\n"
      "fmla v20.4s, v10.4s, v2.s[3]\n"
      "fmla v24.4s, v10.4s, v3.s[3]\n"
      "fmla v28.4s, v10.4s, v4.s[3]\n"
      "fmla v13.4s, v11.4s, v0.s[3]\n"
      "fmla v17.4s, v11.4s, v1.s[3]\n"
      "fmla v21.4s, v11.4s, v2.s[3]\n"
      "fmla v25.4s, v11.4s, v3.s[3]\n"
      "fmla v29.4s, v11.4s, v4.s[3]\n"
      "fmla v14.4s, v5.4s, v0.s[3]\n"
      "fmla v18.4s, v5.4s, v1.s[3]\n"
      "fmla v22.4s, v5.4s, v2.s[3]\n"
      "fmla v26.4s, v5.4s, v3.s[3]\n"
      "fmla v30.4s, v5.4s, v4.s[3]\n"
      "fmla v15.4s, v6.4s, v0.s[3]\n"
      "fmla v19.4s, v6.4s, v1.s[3]\n"
      "fmla v23.4s, v6.4s, v2.s[3]\n"
      "fmla v27.4s, v6.4s, v3.s[3]\n"
      "fmla v31.4s, v6.4s, v4.s[3]\n"
      "7:" // Height 5: Multiply loop: Main loop skip
      "cbz x25, 9f\n"
      "8:" // Height 5: Multiply loop: Odd block loop
      // Tail: one k-step per pass. Loads a single float from each A row
      // (post-incrementing the row pointer by 4 bytes) and four B vectors
      // (0x40 bytes) into v7-v10.
      "ldr s0, [x24], #0x4\n"
      "ldr s1, [x23], #0x4\n"
      "sub x25, x25, #0x1\n"
      "ldr s2, [x22], #0x4\n"
      "ldr s3, [x21], #0x4\n"
      "ldr s4, [x20], #0x4\n"
      "ldr q7, [x27, #0x0]\n"
      "ldr q8, [x27, #0x10]\n"
      "ldr q9, [x27, #0x20]\n"
      "ldr q10, [x27, #0x30]\n"
      "add x27, x27, #0x40\n"
      "fmla v12.4s, v7.4s, v0.s[0]\n"
      "fmla v16.4s, v7.4s, v1.s[0]\n"
      "fmla v20.4s, v7.4s, v2.s[0]\n"
      "fmla v24.4s, v7.4s, v3.s[0]\n"
      "fmla v28.4s, v7.4s, v4.s[0]\n"
      "fmla v13.4s, v8.4s, v0.s[0]\n"
      "fmla v17.4s, v8.4s, v1.s[0]\n"
      "fmla v21.4s, v8.4s, v2.s[0]\n"
      "fmla v25.4s, v8.4s, v3.s[0]\n"
      "fmla v29.4s, v8.4s, v4.s[0]\n"
      "fmla v14.4s, v9.4s, v0.s[0]\n"
      "fmla v18.4s, v9.4s, v1.s[0]\n"
      "fmla v22.4s, v9.4s, v2.s[0]\n"
      "fmla v26.4s, v9.4s, v3.s[0]\n"
      "fmla v30.4s, v9.4s, v4.s[0]\n"
      "fmla v15.4s, v10.4s, v0.s[0]\n"
      "fmla v19.4s, v10.4s, v1.s[0]\n"
      "fmla v23.4s, v10.4s, v2.s[0]\n"
      "fmla v27.4s, v10.4s, v3.s[0]\n"
      "fmla v31.4s, v10.4s, v4.s[0]\n"
      "cbnz x25, 8b\n"
      "9:" // Height 5: Multiply loop: No odd multiplies
      // Write the 5x16 tile back. x20 first holds ldc; x23, x22, x21 and
      // finally x20 become the addresses of C rows 1..4. x26 (row 0) is
      // advanced by 0x40 to the next column block once row 0 has been stored.
      "ldr x20, [%x[gp], %[offsetof_ldc]]\n"
      "prfm pstl1keep, [x26, #0x0]\n"
      "str q12, [x26, #0x0]\n"
      "str q13, [x26, #0x10]\n"
      "str q14, [x26, #0x20]\n"
      "add x23, x26, x20\n"
      "add x22, x23, x20\n"
      "add x21, x22, x20\n"
      "prfm pstl1keep, [x23, #0x0]\n"
      "prfm pstl1keep, [x22, #0x0]\n"
      "str q15, [x26, #0x30]\n"
      "add x20, x21, x20\n"
      "prfm pstl1keep, [x21, #0x0]\n"
      "str q16, [x23, #0x0]\n"
      "add x26, x26, #0x40\n"
      "prfm pstl1keep, [x20, #0x0]\n"
      "str q17, [x23, #0x10]\n"
      "str q18, [x23, #0x20]\n"
      "str q19, [x23, #0x30]\n"
      "str q20, [x22, #0x0]\n"
      "str q21, [x22, #0x10]\n"
      "str q22, [x22, #0x20]\n"
      "str q23, [x22, #0x30]\n"
      "str q24, [x21, #0x0]\n"
      "str q25, [x21, #0x10]\n"
      "str q26, [x21, #0x20]\n"
      "str q27, [x21, #0x30]\n"
      "str q28, [x20, #0x0]\n"
      "str q29, [x20, #0x10]\n"
      "str q30, [x20, #0x20]\n"
      "str q31, [x20, #0x30]\n"
      // Next column block, if any remain.
      "subs x28, x28, #0x1\n"
      "bgt 1b\n"
      :
      // Inputs: gp in a general register; every struct offset is passed as
      // an immediate so it can be used directly in the addressing modes above.
      : [gp] "r"(gp),
        [offsetof_A] "I"(offsetof(GemmParamsFP32, A)),
        [offsetof_B] "I"(offsetof(GemmParamsFP32, B)),
        [offsetof_C] "I"(offsetof(GemmParamsFP32, C)),
        [offsetof_b_block_cols] "I"(offsetof(GemmParamsFP32, b_block_cols)),
        [offsetof_beta] "I"(offsetof(GemmParamsFP32, beta)),
        [offsetof_k] "I"(offsetof(GemmParamsFP32, k)),
        [offsetof_lda] "I"(offsetof(GemmParamsFP32, lda)),
        [offsetof_ldc] "I"(offsetof(GemmParamsFP32, ldc))
      // Clobbers: flags, memory (C is written through raw pointers), all 32
      // vector registers and x9/x20-x28. x20-x28 and v8-v15 are callee-saved
      // under AAPCS64, so listing them is what makes the compiler preserve
      // them around this block.
      : "cc",
        "memory",
        "v0",
        "v1",
        "v10",
        "v11",
        "v12",
        "v13",
        "v14",
        "v15",
        "v16",
        "v17",
        "v18",
        "v19",
        "v2",
        "v20",
        "v21",
        "v22",
        "v23",
        "v24",
        "v25",
        "v26",
        "v27",
        "v28",
        "v29",
        "v3",
        "v30",
        "v31",
        "v4",
        "v5",
        "v6",
        "v7",
        "v8",
        "v9",
        "x20",
        "x21",
        "x22",
        "x23",
        "x24",
        "x25",
        "x26",
        "x27",
        "x28",
        "x9");
#endif // __aarch64__
}

void NOINLINE gemmkernel_6x2_Neon_fp32_fA0fB0fC0(GemmParamsFP32* gp) {
#ifdef __aarch64__
  __asm__ __volatile__(
      "ldr s16, [%x[gp], %[offsetof_beta]]\n"
      "mov x10, #0x1\n"
      "ldr x9, [%x[gp], %[offsetof_b_block_cols]]\n"
      "ldr x28, [%x[gp], %[offsetof_B]]\n"
      "ldr x27, [%x[gp], %[offsetof_C]]\n"
      "fcmp s16, #0.0\n"
      "csel x10, XZR, x10, EQ\n"
      "csel x10, XZR, x10, VS\n"
      "1:" // Height 6: Column loop
      "tbz x10, #0, 2f\n"
      "ldr q8, [x27, #0x0]\n"
      "ldr q9, [x27, #0x10]\n"
      "add x20, %x[gp], %[offsetof_beta]\n"
      "ldr q10, [x27, #0x20]\n"
      "ldr q11, [x27, #0x30]\n"
      "ld1r { v0.4s }, [x20]\n"
      "ldr x23, [%x[gp], %[offsetof_ldc]]\n"
      "add x20, x27, x23\n"
      "ldr q12, [x20, #0x0]\n"
      "ldr q13, [x20, #0x10]\n"
      "ldr q14, [x20, #0x20]\n"
      "ldr q15, [x20, #0x30]\n"
      "add x20, x20, x23\n"
      "ldr q16, [x20, #0x0]\n"
      "ldr q17, [x20, #0x10]\n"
      "add x22, x20, x23\n"
      "fmul v8.4s, v8.4s, v0.4s\n"
      "ldr q18, [x20, #0x20]\n"
      "ldr q19, [x20, #0x30]\n"
      "add x21, x22, x23\n"
      "fmul v9.4s, v9.4s, v0.4s\n"
      "ldr q20, [x22, #0x0]\n"
      "ldr q21, [x22, #0x10]\n"
      "add x20, x21, x23\n"
      "fmul v10.4s, v10.4s, v0.4s\n"
      "ldr q22, [x22, #0x20]\n"
      "ldr q23, [x22, #0x30]\n"
      "fmul v11.4s, v11.4s, v0.4s\n"
      "fmul v12.4s, v12.4s, v0.4s\n"
      "ldr q24, [x21, #0x0]\n"
      "ldr q25, [x21, #0x10]\n"
      "fmul v13.4s, v13.4s, v0.4s\n"
      "fmul v14.4s, v14.4s, v0.4s\n"
      "ldr q26, [x21, #0x20]\n"
      "ldr q27, [x21, #0x30]\n"
      "fmul v15.4s, v15.4s, v0.4s\n"
      "fmul v16.4s, v16.4s, v0.4s\n"
      "ldr q28, [x20, #0x0]\n"
      "ldr q29, [x20, #0x10]\n"
      "fmul v17.4s, v17.4s, v0.4s\n"
      "fmul v18.4s, v18.4s, v0.4s\n"
      "ldr q30, [x20, #0x20]\n"
      "ldr q31, [x20, #0x30]\n"
      "fmul v19.4s, v19.4s, v0.4s\n"
      "fmul v20.4s, v20.4s, v0.4s\n"
      "fmul v21.4s, v21.4s, v0.4s\n"
      "fmul v22.4s, v22.4s, v0.4s\n"
      "fmul v23.4s, v23.4s, v0.4s\n"
      "fmul v24.4s, v24.4s, v0.4s\n"
      "fmul v25.4s, v25.4s, v0.4s\n"
      "fmul v26.4s, v26.4s, v0.4s\n"
      "fmul v27.4s, v27.4s, v0.4s\n"
      "fmul v28.4s, v28.4s, v0.4s\n"
      "fmul v29.4s, v29.4s, v0.4s\n"
      "fmul v30.4s, v30.4s, v0.4s\n"
      "fmul v31.4s, v31.4s, v0.4s\n"
      "b 3f\n"
      "2:" // Height 6: no accumulate
      "movi v8.16b, #0x0\n"
      "movi v9.16b, #0x0\n"
      "movi v10.16b, #0x0\n"
      "movi v11.16b, #0x0\n"
      "movi v12.16b, #0x0\n"
      "movi v13.16b, #0x0\n"
      "movi v14.16b, #0x0\n"
      "movi v15.16b, #0x0\n"
      "movi v16.16b, #0x0\n"
      "movi v17.16b, #0x0\n"
      "movi v18.16b, #0x0\n"
      "movi v19.16b, #0x0\n"
      "movi v20.16b, #0x0\n"
      "movi v21.16b, #0x0\n"
      "movi v22.16b, #0x0\n"
      "movi v23.16b, #0x0\n"
      "movi v24.16b, #0x0\n"
      "movi v25.16b, #0x0\n"
      "movi v26.16b, #0x0\n"
      "movi v27.16b, #0x0\n"
      "movi v28.16b, #0x0\n"
      "movi v29.16b, #0x0\n"
      "movi v30.16b, #0x0\n"
      "movi v31.16b, #0x0\n"
      "3:" // Height 6: setup done
      "ldr x21, [%x[gp], %[offsetof_A]]\n"
      "ldr x20, [%x[gp], %[offsetof_lda]]\n"
      "ldr x26, [%x[gp], %[offsetof_k]]\n"
      "mov x25, x21\n"
      "add x24, x25, x20\n"
      "add x23, x24, x20\n"
      "add x22, x23, x20\n"
      "add x21, x22, x20\n"
      "add x20, x21, x20\n"
      "cmp x26, #0x4\n"
      "blt 7f\n"
      "ldr q0, [x25, #0x0]\n"
      "ldr q1, [x24, #0x0]\n"
      "cmp x26, #0x8\n"
      "ldr q2, [x23, #0x0]\n"
      "ldr q3, [x22, #0x0]\n"
      "ldr q4, [x21, #0x0]\n"
      "ldr q5, [x20, #0x0]\n"
      "ldr q6, [x28, #0x0]\n"
      "ldr q7, [x28, #0x10]\n"
      "blt 6f\n"
      "5:" // Height 6: Multiply loop: Main loop head
      "fmla v8.4s, v6.4s, v0.s[0]\n"
      "fmla v12.4s, v6.4s, v1.s[0]\n"
      "sub x26, x26, #0x4\n"
      "add x25, x25, #0x10\n"
      "fmla v16.4s, v6.4s, v2.s[0]\n"
      "fmla v20.4s, v6.4s, v3.s[0]\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "fmla v24.4s, v6.4s, v4.s[0]\n"
      "fmla v28.4s, v6.4s, v5.s[0]\n"
      "ldr q6, [x28, #0x20]\n"
      "add x22, x22, #0x10\n"
      "fmla v9.4s, v7.4s, v0.s[0]\n"
      "fmla v13.4s, v7.4s, v1.s[0]\n"
      "add x21, x21, #0x10\n"
      "add x20, x20, #0x10\n"
      "fmla v17.4s, v7.4s, v2.s[0]\n"
      "fmla v21.4s, v7.4s, v3.s[0]\n"
      "cmp x26, #0x8\n"
      "prfm pldl1keep, [x25, #0x80]\n"
      "fmla v25.4s, v7.4s, v4.s[0]\n"
      "fmla v29.4s, v7.4s, v5.s[0]\n"
      "ldr q7, [x28, #0x30]\n"
      "prfm pldl1keep, [x24, #0x80]\n"
      "fmla v10.4s, v6.4s, v0.s[0]\n"
      "fmla v14.4s, v6.4s, v1.s[0]\n"
      "prfm pldl1keep, [x23, #0x80]\n"
      "prfm pldl1keep, [x22, #0x80]\n"
      "fmla v18.4s, v6.4s, v2.s[0]\n"
      "fmla v22.4s, v6.4s, v3.s[0]\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v26.4s, v6.4s, v4.s[0]\n"
      "fmla v30.4s, v6.4s, v5.s[0]\n"
      "ldr q6, [x28, #0x40]\n"
      "fmla v11.4s, v7.4s, v0.s[0]\n"
      "fmla v15.4s, v7.4s, v1.s[0]\n"
      "fmla v19.4s, v7.4s, v2.s[0]\n"
      "fmla v23.4s, v7.4s, v3.s[0]\n"
      "fmla v27.4s, v7.4s, v4.s[0]\n"
      "fmla v31.4s, v7.4s, v5.s[0]\n"
      "ldr q7, [x28, #0x50]\n"
      "fmla v8.4s, v6.4s, v0.s[1]\n"
      "fmla v12.4s, v6.4s, v1.s[1]\n"
      "fmla v16.4s, v6.4s, v2.s[1]\n"
      "fmla v20.4s, v6.4s, v3.s[1]\n"
      "fmla v24.4s, v6.4s, v4.s[1]\n"
      "fmla v28.4s, v6.4s, v5.s[1]\n"
      "ldr q6, [x28, #0x60]\n"
      "fmla v9.4s, v7.4s, v0.s[1]\n"
      "fmla v13.4s, v7.4s, v1.s[1]\n"
      "fmla v17.4s, v7.4s, v2.s[1]\n"
      "fmla v21.4s, v7.4s, v3.s[1]\n"
      "fmla v25.4s, v7.4s, v4.s[1]\n"
      "fmla v29.4s, v7.4s, v5.s[1]\n"
      "ldr q7, [x28, #0x70]\n"
      "fmla v10.4s, v6.4s, v0.s[1]\n"
      "fmla v14.4s, v6.4s, v1.s[1]\n"
      "fmla v18.4s, v6.4s, v2.s[1]\n"
      "fmla v22.4s, v6.4s, v3.s[1]\n"
      "fmla v26.4s, v6.4s, v4.s[1]\n"
      "fmla v30.4s, v6.4s, v5.s[1]\n"
      "ldr q6, [x28, #0x80]\n"
      "fmla v11.4s, v7.4s, v0.s[1]\n"
      "fmla v15.4s, v7.4s, v1.s[1]\n"
      "fmla v19.4s, v7.4s, v2.s[1]\n"
      "fmla v23.4s, v7.4s, v3.s[1]\n"
      "fmla v27.4s, v7.4s, v4.s[1]\n"
      "fmla v31.4s, v7.4s, v5.s[1]\n"
      "ldr q7, [x28, #0x90]\n"
      "fmla v8.4s, v6.4s, v0.s[2]\n"
      "fmla v12.4s, v6.4s, v1.s[2]\n"
      "fmla v16.4s, v6.4s, v2.s[2]\n"
      "fmla v20.4s, v6.4s, v3.s[2]\n"
      "fmla v24.4s, v6.4s, v4.s[2]\n"
      "fmla v28.4s, v6.4s, v5.s[2]\n"
      "ldr q6, [x28, #0xa0]\n"
      "fmla v9.4s, v7.4s, v0.s[2]\n"
      "fmla v13.4s, v7.4s, v1.s[2]\n"
      "fmla v17.4s, v7.4s, v2.s[2]\n"
      "fmla v21.4s, v7.4s, v3.s[2]\n"
      "fmla v25.4s, v7.4s, v4.s[2]\n"
      "fmla v29.4s, v7.4s, v5.s[2]\n"
      "ldr q7, [x28, #0xb0]\n"
      "fmla v10.4s, v6.4s, v0.s[2]\n"
      "fmla v14.4s, v6.4s, v1.s[2]\n"
      "fmla v18.4s, v6.4s, v2.s[2]\n"
      "fmla v22.4s, v6.4s, v3.s[2]\n"
      "fmla v26.4s, v6.4s, v4.s[2]\n"
      "fmla v30.4s, v6.4s, v5.s[2]\n"
      "ldr q6, [x28, #0xc0]\n"
      "fmla v11.4s, v7.4s, v0.s[2]\n"
      "fmla v15.4s, v7.4s, v1.s[2]\n"
      "fmla v19.4s, v7.4s, v2.s[2]\n"
      "fmla v23.4s, v7.4s, v3.s[2]\n"
      "fmla v27.4s, v7.4s, v4.s[2]\n"
      "fmla v31.4s, v7.4s, v5.s[2]\n"
      "ldr q7, [x28, #0xd0]\n"
      "fmla v8.4s, v6.4s, v0.s[3]\n"
      "fmla v12.4s, v6.4s, v1.s[3]\n"
      "fmla v16.4s, v6.4s, v2.s[3]\n"
      "fmla v20.4s, v6.4s, v3.s[3]\n"
      "fmla v24.4s, v6.4s, v4.s[3]\n"
      "fmla v28.4s, v6.4s, v5.s[3]\n"
      "ldr q6, [x28, #0xe0]\n"
      "fmla v9.4s, v7.4s, v0.s[3]\n"
      "fmla v13.4s, v7.4s, v1.s[3]\n"
      "fmla v17.4s, v7.4s, v2.s[3]\n"
      "fmla v21.4s, v7.4s, v3.s[3]\n"
      "fmla v25.4s, v7.4s, v4.s[3]\n"
      "fmla v29.4s, v7.4s, v5.s[3]\n"
      "ldr q7, [x28, #0xf0]\n"
      "add x28, x28, #0x100\n"
      "fmla v10.4s, v6.4s, v0.s[3]\n"
      "fmla v14.4s, v6.4s, v1.s[3]\n"
      "fmla v18.4s, v6.4s, v2.s[3]\n"
      "fmla v22.4s, v6.4s, v3.s[3]\n"
      "fmla v26.4s, v6.4s, v4.s[3]\n"
      "fmla v30.4s, v6.4s, v5.s[3]\n"
      "ldr q6, [x28, #0x0]\n"
      "fmla v11.4s, v7.4s, v0.s[3]\n"
      "ldr q0, [x25, #0x0]\n"
      "fmla v15.4s, v7.4s, v1.s[3]\n"
      "ldr q1, [x24, #0x0]\n"
      "fmla v19.4s, v7.4s, v2.s[3]\n"
      "ldr q2, [x23, #0x0]\n"
      "fmla v23.4s, v7.4s, v3.s[3]\n"
      "ldr q3, [x22, #0x0]\n"
      "fmla v27.4s, v7.4s, v4.s[3]\n"
      "ldr q4, [x21, #0x0]\n"
      "fmla v31.4s, v7.4s, v5.s[3]\n"
      "ldr q5, [x20, #0x0]\n"
      "ldr q7, [x28, #0x10]\n"
      "bge 5b\n"
      "6:" // Height 6: Multiply loop: Single iteration only
      "fmla v8.4s, v6.4s, v0.s[0]\n"
      "fmla v12.4s, v6.4s, v1.s[0]\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "fmla v16.4s, v6.4s, v2.s[0]\n"
      "fmla v20.4s, v6.4s, v3.s[0]\n"
      "add x23, x23, #0x10\n"
      "add x22, x22, #0x10\n"
      "fmla v24.4s, v6.4s, v4.s[0]\n"
      "fmla v28.4s, v6.4s, v5.s[0]\n"
      "ldr q6, [x28, #0x20]\n"
      "add x21, x21, #0x10\n"
      "fmla v9.4s, v7.4s, v0.s[0]\n"
      "fmla v13.4s, v7.4s, v1.s[0]\n"
      "add x20, x20, #0x10\n"
      "prfm pldl1keep, [x25, #0x80]\n"
      "fmla v17.4s, v7.4s, v2.s[0]\n"
      "fmla v21.4s, v7.4s, v3.s[0]\n"
      "prfm pldl1keep, [x24, #0x80]\n"
      "prfm pldl1keep, [x23, #0x80]\n"
      "fmla v25.4s, v7.4s, v4.s[0]\n"
      "fmla v29.4s, v7.4s, v5.s[0]\n"
      "ldr q7, [x28, #0x30]\n"
      "prfm pldl1keep, [x22, #0x80]\n"
      "fmla v10.4s, v6.4s, v0.s[0]\n"
      "fmla v14.4s, v6.4s, v1.s[0]\n"
      "sub x26, x26, #0x4\n"
      "prfm pldl1keep, [x21, #0x80]\n"
      "fmla v18.4s, v6.4s, v2.s[0]\n"
      "fmla v22.4s, v6.4s, v3.s[0]\n"
      "prfm pldl1keep, [x20, #0x80]\n"
      "fmla v26.4s, v6.4s, v4.s[0]\n"
      "fmla v30.4s, v6.4s, v5.s[0]\n"
      "ldr q6, [x28, #0x40]\n"
      "fmla v11.4s, v7.4s, v0.s[0]\n"
      "fmla v15.4s, v7.4s, v1.s[0]\n"
      "fmla v19.4s, v7.4s, v2.s[0]\n"
      "fmla v23.4s, v7.4s, v3.s[0]\n"
      "fmla v27.4s, v7.4s, v4.s[0]\n"
      "fmla v31.4s, v7.4s, v5.s[0]\n"
      "ldr q7, [x28, #0x50]\n"
      "fmla v8.4s, v6.4s, v0.s[1]\n"
      "fmla v12.4s, v6.4s, v1.s[1]\n"
      "fmla v16.4s, v6.4s, v2.s[1]\n"
      "fmla v20.4s, v6.4s, v3.s[1]\n"
      "fmla v24.4s, v6.4s, v4.s[1]\n"
      "fmla v28.4s, v6.4s, v5.s[1]\n"
      "ldr q6, [x28, #0x60]\n"
      "fmla v9.4s, v7.4s, v0.s[1]\n"
      "fmla v13.4s, v7.4s, v1.s[1]\n"
      "fmla v17.4s, v7.4s, v2.s[1]\n"
      "fmla v21.4s, v7.4s, v3.s[1]\n"
      "fmla v25.4s, v7.4s, v4.s[1]\n"
      "fmla v29.4s, v7.4s, v5.s[1]\n"
      "ldr q7, [x28, #0x70]\n"
      "fmla v10.4s, v6.4s, v0.s[1]\n"
      "fmla v14.4s, v6.4s, v1.s[1]\n"
      "fmla v18.4s, v6.4s, v2.s[1]\n"
      "fmla v22.4s, v6.4s, v3.s[1]\n"
      "fmla v26.4s, v6.4s, v4.s[1]\n"
      "fmla v30.4s, v6.4s, v5.s[1]\n"
      "ldr q6, [x28, #0x80]\n"
      "fmla v11.4s, v7.4s, v0.s[1]\n"
      "fmla v15.4s, v7.4s, v1.s[1]\n"
      "fmla v19.4s, v7.4s, v2.s[1]\n"
      "fmla v23.4s, v7.4s, v3.s[1]\n"
      "fmla v27.4s, v7.4s, v4.s[1]\n"
      "fmla v31.4s, v7.4s, v5.s[1]\n"
      "ldr q7, [x28, #0x90]\n"
      "fmla v8.4s, v6.4s, v0.s[2]\n"
      "fmla v12.4s, v6.4s, v1.s[2]\n"
      "fmla v16.4s, v6.4s, v2.s[2]\n"
      "fmla v20.4s, v6.4s, v3.s[2]\n"
      "fmla v24.4s, v6.4s, v4.s[2]\n"
      "fmla v28.4s, v6.4s, v5.s[2]\n"
      "ldr q6, [x28, #0xa0]\n"
      "fmla v9.4s, v7.4s, v0.s[2]\n"
      "fmla v13.4s, v7.4s, v1.s[2]\n"
      "fmla v17.4s, v7.4s, v2.s[2]\n"
      "fmla v21.4s, v7.4s, v3.s[2]\n"
      "fmla v25.4s, v7.4s, v4.s[2]\n"
      "fmla v29.4s, v7.4s, v5.s[2]\n"
      "ldr q7, [x28, #0xb0]\n"
      "fmla v10.4s, v6.4s, v0.s[2]\n"
      "fmla v14.4s, v6.4s, v1.s[2]\n"
      "fmla v18.4s, v6.4s, v2.s[2]\n"
      "fmla v22.4s, v6.4s, v3.s[2]\n"
      "fmla v26.4s, v6.4s, v4.s[2]\n"
      "fmla v30.4s, v6.4s, v5.s[2]\n"
      "ldr q6, [x28, #0xc0]\n"
      "fmla v11.4s, v7.4s, v0.s[2]\n"
      "fmla v15.4s, v7.4s, v1.s[2]\n"
      "fmla v19.4s, v7.4s, v2.s[2]\n"
      "fmla v23.4s, v7.4s, v3.s[2]\n"
      "fmla v27.4s, v7.4s, v4.s[2]\n"
      "fmla v31.4s, v7.4s, v5.s[2]\n"
      "ldr q7, [x28, #0xd0]\n"
      "fmla v8.4s, v6.4s, v0.s[3]\n"
      "fmla v12.4s, v6.4s, v1.s[3]\n"
      "fmla v16.4s, v6.4s, v2.s[3]\n"
      "fmla v20.4s, v6.4s, v3.s[3]\n"
      "fmla v24.4s, v6.4s, v4.s[3]\n"
      "fmla v28.4s, v6.4s, v5.s[3]\n"
      "ldr q6, [x28, #0xe0]\n"
      "fmla v9.4s, v7.4s, v0.s[3]\n"
      "fmla v13.4s, v7.4s, v1.s[3]\n"
      "fmla v17.4s, v7.4s, v2.s[3]\n"
      "fmla v21.4s, v7.4s, v3.s[3]\n"
      "fmla v25.4s, v7.4s, v4.s[3]\n"
      "fmla v29.4s, v7.4s, v5.s[3]\n"
      "ldr q7, [x28, #0xf0]\n"
      "add x28, x28, #0x100\n"
      "fmla v10.4s, v6.4s, v0.s[3]\n"
      "fmla v14.4s, v6.4s, v1.s[3]\n"
      "fmla v18.4s, v6.4s, v2.s[3]\n"
      "fmla v22.4s, v6.4s, v3.s[3]\n"
      "fmla v26.4s, v6.4s, v4.s[3]\n"
      "fmla v30.4s, v6.4s, v5.s[3]\n"
      "fmla v11.4s, v7.4s, v0.s[3]\n"
      "fmla v15.4s, v7.4s, v1.s[3]\n"
      "fmla v19.4s, v7.4s, v2.s[3]\n"
      "fmla v23.4s, v7.4s, v3.s[3]\n"
      "fmla v27.4s, v7.4s, v4.s[3]\n"
      "fmla v31.4s, v7.4s, v5.s[3]\n"
      "7:" // Height 6: Multiply loop: Main loop skip
      "cbz x26, 9f\n"
      "8:" // Height 6: Multiply loop: Odd block loop
      "ldr s0, [x25], #0x4\n"
      "ldr s1, [x24], #0x4\n"
      "sub x26, x26, #0x1\n"
      "ldr s2, [x23], #0x4\n"
      "ldr s3, [x22], #0x4\n"
      "ldr s4, [x21], #0x4\n"
      "ldr s5, [x20], #0x4\n"
      "ldr q6, [x28, #0x0]\n"
      "ldr q7, [x28, #0x10]\n"
      "fmla v8.4s, v6.4s, v0.s[0]\n"
      "fmla v12.4s, v6.4s, v1.s[0]\n"
      "fmla v16.4s, v6.4s, v2.s[0]\n"
      "fmla v20.4s, v6.4s, v3.s[0]\n"
      "fmla v24.4s, v6.4s, v4.s[0]\n"
      "fmla v28.4s, v6.4s, v5.s[0]\n"
      "ldr q6, [x28, #0x20]\n"
      "fmla v9.4s, v7.4s, v0.s[0]\n"
      "fmla v13.4s, v7.4s, v1.s[0]\n"
      "fmla v17.4s, v7.4s, v2.s[0]\n"
      "fmla v21.4s, v7.4s, v3.s[0]\n"
      "fmla v25.4s, v7.4s, v4.s[0]\n"
      "fmla v29.4s, v7.4s, v5.s[0]\n"
      "ldr q7, [x28, #0x30]\n"
      "add x28, x28, #0x40\n"
      "fmla v10.4s, v6.4s, v0.s[0]\n"
      "fmla v14.4s, v6.4s, v1.s[0]\n"
      "fmla v18.4s, v6.4s, v2.s[0]\n"
      "fmla v22.4s, v6.4s, v3.s[0]\n"
      "fmla v26.4s, v6.4s, v4.s[0]\n"
      "fmla v30.4s, v6.4s, v5.s[0]\n"
      "fmla v11.4s, v7.4s, v0.s[0]\n"
      "fmla v15.4s, v7.4s, v1.s[0]\n"
      "fmla v19.4s, v7.4s, v2.s[0]\n"
      "fmla v23.4s, v7.4s, v3.s[0]\n"
      "fmla v27.4s, v7.4s, v4.s[0]\n"
      "fmla v31.4s, v7.4s, v5.s[0]\n"
      "cbnz x26, 8b\n"
      "9:" // Height 6: Multiply loop: No odd multiplies
      "ldr x20, [%x[gp], %[offsetof_ldc]]\n"
      "prfm pstl1keep, [x27, #0x0]\n"
      "str q8, [x27, #0x0]\n"
      "str q9, [x27, #0x10]\n"
      "str q10, [x27, #0x20]\n"
      "add x24, x27, x20\n"
      "add x23, x24, x20\n"
      "add x22, x23, x20\n"
      "prfm pstl1keep, [x24, #0x0]\n"
      "prfm pstl1keep, [x23, #0x0]\n"
      "str q11, [x27, #0x30]\n"
      "add x21, x22, x20\n"
      "prfm pstl1keep, [x22, #0x0]\n"
      "str q12, [x24, #0x0]\n"
      "add x27, x27, #0x40\n"
      "add x20, x21, x20\n"
      "prfm pstl1keep, [x21, #0x0]\n"
      "str q13, [x24, #0x10]\n"
      "prfm pstl1keep, [x20, #0x0]\n"
      "str q14, [x24, #0x20]\n"
      "str q15, [x24, #0x30]\n"
      "str q16, [x23, #0x0]\n"
      "str q17, [x23, #0x10]\n"
      "str q18, [x23, #0x20]\n"
      "str q19, [x23, #0x30]\n"
      "str q20, [x22, #0x0]\n"
      "str q21, [x22, #0x10]\n"
      "str q22, [x22, #0x20]\n"
      "str q23, [x22, #0x30]\n"
      "str q24, [x21, #0x0]\n"
      "str q25, [x21, #0x10]\n"
      "str q26, [x21, #0x20]\n"
      "str q27, [x21, #0x30]\n"
      "str q28, [x20, #0x0]\n"
      "str q29, [x20, #0x10]\n"
      "str q30, [x20, #0x20]\n"
      "str q31, [x20, #0x30]\n"
      "subs x9, x9, #0x1\n"
      "bgt 1b\n"
      :
      : [gp] "r"(gp),
        [offsetof_A] "I"(offsetof(GemmParamsFP32, A)),
        [offsetof_B] "I"(offsetof(GemmParamsFP32, B)),
        [offsetof_C] "I"(offsetof(GemmParamsFP32, C)),
        [offsetof_b_block_cols] "I"(offsetof(GemmParamsFP32, b_block_cols)),
        [offsetof_beta] "I"(offsetof(GemmParamsFP32, beta)),
        [offsetof_k] "I"(offsetof(GemmParamsFP32, k)),
        [offsetof_lda] "I"(offsetof(GemmParamsFP32, lda)),
        [offsetof_ldc] "I"(offsetof(GemmParamsFP32, ldc))
      : "cc",
        "memory",
        "v0",
        "v1",
        "v10",
        "v11",
        "v12",
        "v13",
        "v14",
        "v15",
        "v16",
        "v17",
        "v18",
        "v19",
        "v2",
        "v20",
        "v21",
        "v22",
        "v23",
        "v24",
        "v25",
        "v26",
        "v27",
        "v28",
        "v29",
        "v3",
        "v30",
        "v31",
        "v4",
        "v5",
        "v6",
        "v7",
        "v8",
        "v9",
        "x10",
        "x20",
        "x21",
        "x22",
        "x23",
        "x24",
        "x25",
        "x26",
        "x27",
        "x28",
        "x9");
#endif // __aarch64__
}

} // namespace kleidiai

#endif
